--- /dev/null
+cmake_minimum_required(VERSION 3.7.2)
+
+project(pcre2 LANGUAGES C)
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+
+set(CMAKE_C_STANDARD 99)
+
+set(PCRE2_INCLUDE_DIR ${CMAKE_BINARY_DIR}/src/libpcre2/interface)
+# NOTE: -fPIC is also implied by CMAKE_POSITION_INDEPENDENT_CODE above; it is
+# kept here because these args are forwarded verbatim to the Cython build.
+set(CYTHON_EXTRA_COMPILE_ARGS -DPCRE2_CODE_UNIT_WIDTH=8 -fPIC)
+
+# Set PCRE2 options. FORCE is deliberate: these override the vendored
+# subproject's defaults, which this wrapper requires (JIT support and
+# PCRE2_NEVER_BACKSLASH_C).
+set(PCRE2_SUPPORT_JIT ON CACHE BOOL "" FORCE)
+set(PCRE2_NEVER_BACKSLASH_C ON CACHE BOOL "" FORCE)
+
+# Default to a Release build on single-config generators when the user has not
+# chosen a build type. Do not clobber an explicit user selection, and do not
+# set CMAKE_BUILD_TYPE at all under multi-config generators (where it is
+# ignored and CMAKE_CONFIGURATION_TYPES is set instead).
+if(NOT CMAKE_CONFIGURATION_TYPES AND NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
+endif()
+
+# Build PCRE2 library as both shared and static.
+set(BUILD_STATIC_LIBS ON)
+set(BUILD_SHARED_LIBS ON)
+add_subdirectory(src/libpcre2)
+
+# Build Cython code as shared.
+set(BUILD_STATIC_LIBS OFF)
+set(BUILD_SHARED_LIBS ON)
+add_subdirectory(src/pcre2)
+
+# Include PCRE2 header for Cython API.
+install(FILES ${PCRE2_INCLUDE_DIR}/pcre2.h DESTINATION src/pcre2)
--- /dev/null
+BSD 3-Clause License
+
+Copyright (c) 2022, grtetrault
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- /dev/null
+SHELL = /bin/bash
+
+# None of these targets produce a file of the same name; declare them phony so
+# make never skips them because of a stray file/directory.
+.PHONY: init build clean purge benchmark
+
+init:
+	git submodule update --init --recursive
+	python3 -m venv ./.venv
+	./.venv/bin/pip install -r ./requirements/build-requirements.txt
+	./.venv/bin/pip install -r ./requirements/test-requirements.txt
+	./.venv/bin/pip install .
+
+build:
+	./.venv/bin/pip install . --force-reinstall
+
+clean:
+	rm -rf ./dist
+	rm -rf ./build
+	rm -rf ./_skbuild
+	find ./src/pcre2 -type f -name '*.c' -print0 | xargs -0 rm -vf
+	find ./src/pcre2 -type f -name '*.html' -print0 | xargs -0 rm -vf
+	# -print0/-0 handles paths with spaces; rm -f/-rf keeps the recipe from
+	# failing when find matches nothing (xargs would otherwise invoke rm with
+	# no operands).
+	find . -type f -name '*.pyc' -print0 | xargs -0 rm -f
+	find . -type d -name '*.egg-info' -print0 | xargs -0 rm -rf
+	find . -type d -name '*.ipynb_checkpoints' -print0 | xargs -0 rm -rf
+
+purge:
+	rm -rf ./.venv
+
+benchmark:
+	./.venv/bin/python ./benchmarks/run_regex_redux.py
--- /dev/null
+Metadata-Version: 2.4
+Name: pcre2
+Version: 0.6.0
+Summary: Python bindings for the PCRE2 regular expression library
+Home-page: https://github.com/grtetrault/pcre2.py
+Author: Garrett Tetrault
+License: BSD 3-Clause License
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Programming Language :: C
+Classifier: Programming Language :: Cython
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
+Classifier: Operating System :: MacOS :: MacOS X
+Classifier: Operating System :: POSIX :: Linux
+Classifier: Operating System :: Microsoft :: Windows
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Dynamic: author
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: license
+Dynamic: license-file
+Dynamic: summary
+
+# PCRE2.py: Python bindings for the PCRE2 regular expression library
+
+This project contains Python bindings for [PCRE2](https://github.com/PCRE2Project/pcre2).
+PCRE2 is the revised API for the Perl-compatible regular expressions (PCRE) library created by Philip Hazel.
+For original source code, see the [official PCRE2 repository](https://github.com/PCRE2Project/pcre2).
+
+## Installation
+
+From PyPI:
+```
+pip install pcre2
+```
+
+If a wheel is not available for your platform, the module will be built from source.
+Building requires:
+
+* `cmake`
+* C compiler toolchain, such as `gcc` and `make`
+* `libtool`
+* Python headers
+
+## Usage
+
+This library aims to be compatible with Python's built-in `re` module. In many cases, this means
+that `pcre2` can drop-in replace `re` to gain some performance (see benchmarks below).
+However, PCRE2 and Python implement different regex specifications, so patterns and behavior will
+not always be translatable (e.g., the syntax for group replacement differs).
+
+Regular expressions are compiled with `pcre2.compile()` which accepts both unicode strings and
+bytes-like objects.
+This returns a `Pattern` object.
+Expressions can be compiled with a number of options (combined with the bitwise-or operator) and
+can be JIT compiled,
+
+```python
+>>> import pcre2
+>>> expr = r'(?<head>\w+)\s+(?<tail>\w+)'
+>>> patn = pcre2.compile(expr, flags=pcre2.I, jit=True)
+>>> # Patterns can also be JIT compiled after initialization.
+>>> patn.jit_compile()
+```
+
+Inspection of `Pattern` objects is done as follows,
+
+```python
+>>> patn.jit
+True
+>>> patn.groupindex
+{'head': 1, 'tail': 2}
+>>> patn.flags
+<CompileOption.IGNORECASE: 8>
+```
+
+Once compiled, `Pattern` objects can be used to match against strings.
+Matching returns a `Match` object, which has several functions to view results,
+
+```python
+>>> subj = 'foo bar buzz bazz'
+>>> match = patn.match(subj)
+>>> match[0]
+'foo bar'
+>>> match.span()
+(0, 7)
+```
+
+Substitution is also supported, both from `Pattern` and `Match` objects,
+
+```python
+>>> repl = '$2 $1'
+>>> patn.sub(repl, subj) # Global substitutions by default.
+'bar foo bazz buzz'
+>>> patn.sub(repl, subj, count=1)
+'bar foo buzz bazz'
+>>> match.expand(repl)
+'bar foo'
+```
+
+Additionally, `Pattern` objects support scanning over subjects for all non-overlapping matches,
+
+```python
+>>> for match in patn.finditer(subj):
+... print(match.group('head'))
+...
+foo
+buzz
+```
+
+## Performance
+
+PCRE2 provides a fast regular expression library, particularly with JIT compilation enabled.
+Below are the `regex-redux` benchmark results included in this repository,
+
+| Script | Number of runs | Total time | Real time | User time | System time |
+| ------------------- | -------------- | ---------- | ---------- | ----------- | ------------- |
+| baseline.py | 10 | 3.230 | 0.323 | 0.020 | 0.100 |
+| re_vanilla.py | 10 | 51.090 | 5.109 | 11.375 | 0.530 |
+| pcre2_vanilla.py | 10 | 21.980 | 2.198 | 3.154 | 0.483 |
+| pcre2_optimized.py | 10 | 14.860 | 1.486 | 2.520 | 0.548 |
+| cffi_optimized.py | 10 | 14.130 | 1.413 | 3.111 | 0.411 |
+
+Script descriptions are as follows,
+
+| Script | Description |
+| ------------------- | -------------------------------------------------------------------- |
+| `baseline.py` | Reads input file and outputs stored expected output |
+| `re_vanilla.py`      | Pure Python version                                                  |
+| `pcre2_vanilla.py`   | Same as `re_vanilla.py`, with `pcre2` drop-in replacing `re`         |
+| `pcre2_optimized.py` | More optimized implementation using `pcre2`                          |
+| `cffi_optimized.py` | Manually written Python `ctypes` bindings for shared PCRE2 C library |
+
+Tests were performed on an M2 Macbook Air.
+Note that to run benchmarks locally, [Git LFS](https://git-lfs.com/) must be installed to download the input dataset.
+Additionally, a Python virtual environment must be created, and the package built
+with `make init` and `make build` respectively.
+For more information on this benchmark, see [The Computer Language Benchmarks Game](https://benchmarksgame-team.pages.debian.net/benchmarksgame/performance/regexredux.html).
+See source code of benchmark scripts for details and original sources.
--- /dev/null
+# PCRE2.py: Python bindings for the PCRE2 regular expression library
+
+This project contains Python bindings for [PCRE2](https://github.com/PCRE2Project/pcre2).
+PCRE2 is the revised API for the Perl-compatible regular expressions (PCRE) library created by Philip Hazel.
+For original source code, see the [official PCRE2 repository](https://github.com/PCRE2Project/pcre2).
+
+## Installation
+
+From PyPI:
+```
+pip install pcre2
+```
+
+If a wheel is not available for your platform, the module will be built from source.
+Building requires:
+
+* `cmake`
+* C compiler toolchain, such as `gcc` and `make`
+* `libtool`
+* Python headers
+
+## Usage
+
+This library aims to be compatible with Python's built-in `re` module. In many cases, this means
+that `pcre2` can drop-in replace `re` to gain some performance (see benchmarks below).
+However, PCRE2 and Python implement different regex specifications, so patterns and behavior will
+not always be translatable (e.g., the syntax for group replacement differs).
+
+Regular expressions are compiled with `pcre2.compile()` which accepts both unicode strings and
+bytes-like objects.
+This returns a `Pattern` object.
+Expressions can be compiled with a number of options (combined with the bitwise-or operator) and
+can be JIT compiled,
+
+```python
+>>> import pcre2
+>>> expr = r'(?<head>\w+)\s+(?<tail>\w+)'
+>>> patn = pcre2.compile(expr, flags=pcre2.I, jit=True)
+>>> # Patterns can also be JIT compiled after initialization.
+>>> patn.jit_compile()
+```
+
+Inspection of `Pattern` objects is done as follows,
+
+```python
+>>> patn.jit
+True
+>>> patn.groupindex
+{'head': 1, 'tail': 2}
+>>> patn.flags
+<CompileOption.IGNORECASE: 8>
+```
+
+Once compiled, `Pattern` objects can be used to match against strings.
+Matching returns a `Match` object, which has several functions to view results,
+
+```python
+>>> subj = 'foo bar buzz bazz'
+>>> match = patn.match(subj)
+>>> match[0]
+'foo bar'
+>>> match.span()
+(0, 7)
+```
+
+Substitution is also supported, both from `Pattern` and `Match` objects,
+
+```python
+>>> repl = '$2 $1'
+>>> patn.sub(repl, subj) # Global substitutions by default.
+'bar foo bazz buzz'
+>>> patn.sub(repl, subj, count=1)
+'bar foo buzz bazz'
+>>> match.expand(repl)
+'bar foo'
+```
+
+Additionally, `Pattern` objects support scanning over subjects for all non-overlapping matches,
+
+```python
+>>> for match in patn.finditer(subj):
+... print(match.group('head'))
+...
+foo
+buzz
+```
+
+## Performance
+
+PCRE2 provides a fast regular expression library, particularly with JIT compilation enabled.
+Below are the `regex-redux` benchmark results included in this repository,
+
+| Script | Number of runs | Total time | Real time | User time | System time |
+| ------------------- | -------------- | ---------- | ---------- | ----------- | ------------- |
+| baseline.py | 10 | 3.230 | 0.323 | 0.020 | 0.100 |
+| re_vanilla.py | 10 | 51.090 | 5.109 | 11.375 | 0.530 |
+| pcre2_vanilla.py | 10 | 21.980 | 2.198 | 3.154 | 0.483 |
+| pcre2_optimized.py | 10 | 14.860 | 1.486 | 2.520 | 0.548 |
+| cffi_optimized.py | 10 | 14.130 | 1.413 | 3.111 | 0.411 |
+
+Script descriptions are as follows,
+
+| Script | Description |
+| ------------------- | -------------------------------------------------------------------- |
+| `baseline.py` | Reads input file and outputs stored expected output |
+| `re_vanilla.py`      | Pure Python version                                                  |
+| `pcre2_vanilla.py`   | Same as `re_vanilla.py`, with `pcre2` drop-in replacing `re`         |
+| `pcre2_optimized.py` | More optimized implementation using `pcre2`                          |
+| `cffi_optimized.py` | Manually written Python `ctypes` bindings for shared PCRE2 C library |
+
+Tests were performed on an M2 Macbook Air.
+Note that to run benchmarks locally, [Git LFS](https://git-lfs.com/) must be installed to download the input dataset.
+Additionally, a Python virtual environment must be created, and the package built
+with `make init` and `make build` respectively.
+For more information on this benchmark, see [The Computer Language Benchmarks Game](https://benchmarksgame-team.pages.debian.net/benchmarksgame/performance/regexredux.html).
+See source code of benchmark scripts for details and original sources.
--- /dev/null
+[build-system]
+requires = [
+ "setuptools>=42",
+ "scikit-build",
+ "Cython",
+ "cmake"
+]
+build-backend = "setuptools.build_meta"
--- /dev/null
+requests
+build
+wheel
+scikit-build
+cmake
+Cython
\ No newline at end of file
--- /dev/null
+twine
+pytest
+gitpython
\ No newline at end of file
--- /dev/null
+[egg_info]
+tag_build =
+tag_date = 0
+
--- /dev/null
+# -*- coding:utf-8 -*-
+
+import os
+import skbuild
+import setuptools
+
+
+def get_long_desciption():
+ cwd = os.path.abspath(os.path.dirname(__file__))
+ filename = os.path.join(cwd, "README.md")
+ with open(filename) as f:
+ long_description = f.read()
+
+ return long_description
+
+
+skbuild.setup(
+ name="pcre2",
+ version="0.6.0",
+ description="Python bindings for the PCRE2 regular expression library",
+ long_description=get_long_desciption(),
+ long_description_content_type="text/markdown",
+ license="BSD 3-Clause License",
+ author="Garrett Tetrault",
+ url="https://github.com/grtetrault/pcre2.py",
+ classifiers=[
+ "Development Status :: 4 - Beta",
+ "Intended Audience :: Developers",
+ "License :: OSI Approved :: BSD License",
+ "Programming Language :: C",
+ "Programming Language :: Cython",
+ "Programming Language :: Python :: 3",
+ "Programming Language :: Python :: 3.8",
+ "Programming Language :: Python :: 3.9",
+ "Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12",
+ "Programming Language :: Python :: 3.13",
+ "Programming Language :: Python :: 3.14",
+ "Operating System :: MacOS :: MacOS X",
+ "Operating System :: POSIX :: Linux",
+ "Operating System :: Microsoft :: Windows",
+ ],
+ include_package_data=True,
+ packages=setuptools.find_packages("src"),
+ package_dir={"": "src"},
+ cmake_languages="C",
+)
--- /dev/null
+Metadata-Version: 2.4
+Name: pcre2
+Version: 0.6.0
+Summary: Python bindings for the PCRE2 regular expression library
+Home-page: https://github.com/grtetrault/pcre2.py
+Author: Garrett Tetrault
+License: BSD 3-Clause License
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Programming Language :: C
+Classifier: Programming Language :: Cython
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
+Classifier: Operating System :: MacOS :: MacOS X
+Classifier: Operating System :: POSIX :: Linux
+Classifier: Operating System :: Microsoft :: Windows
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Dynamic: author
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: license
+Dynamic: license-file
+Dynamic: summary
+
+# PCRE2.py: Python bindings for the PCRE2 regular expression library
+
+This project contains Python bindings for [PCRE2](https://github.com/PCRE2Project/pcre2).
+PCRE2 is the revised API for the Perl-compatible regular expressions (PCRE) library created by Philip Hazel.
+For original source code, see the [official PCRE2 repository](https://github.com/PCRE2Project/pcre2).
+
+## Installation
+
+From PyPI:
+```
+pip install pcre2
+```
+
+If a wheel is not available for your platform, the module will be built from source.
+Building requires:
+
+* `cmake`
+* C compiler toolchain, such as `gcc` and `make`
+* `libtool`
+* Python headers
+
+## Usage
+
+This library aims to be compatible with Python's built-in `re` module. In many cases, this means
+that `pcre2` can drop-in replace `re` to gain some performance (see benchmarks below).
+However, PCRE2 and Python implement different regex specifications, so patterns and behavior will
+not always be translatable (e.g., the syntax for group replacement differs).
+
+Regular expressions are compiled with `pcre2.compile()` which accepts both unicode strings and
+bytes-like objects.
+This returns a `Pattern` object.
+Expressions can be compiled with a number of options (combined with the bitwise-or operator) and
+can be JIT compiled,
+
+```python
+>>> import pcre2
+>>> expr = r'(?<head>\w+)\s+(?<tail>\w+)'
+>>> patn = pcre2.compile(expr, flags=pcre2.I, jit=True)
+>>> # Patterns can also be JIT compiled after initialization.
+>>> patn.jit_compile()
+```
+
+Inspection of `Pattern` objects is done as follows,
+
+```python
+>>> patn.jit
+True
+>>> patn.groupindex
+{'head': 1, 'tail': 2}
+>>> patn.flags
+<CompileOption.IGNORECASE: 8>
+```
+
+Once compiled, `Pattern` objects can be used to match against strings.
+Matching returns a `Match` object, which has several functions to view results,
+
+```python
+>>> subj = 'foo bar buzz bazz'
+>>> match = patn.match(subj)
+>>> match[0]
+'foo bar'
+>>> match.span()
+(0, 7)
+```
+
+Substitution is also supported, both from `Pattern` and `Match` objects,
+
+```python
+>>> repl = '$2 $1'
+>>> patn.sub(repl, subj) # Global substitutions by default.
+'bar foo bazz buzz'
+>>> patn.sub(repl, subj, count=1)
+'bar foo buzz bazz'
+>>> match.expand(repl)
+'bar foo'
+```
+
+Additionally, `Pattern` objects support scanning over subjects for all non-overlapping matches,
+
+```python
+>>> for match in patn.finditer(subj):
+... print(match.group('head'))
+...
+foo
+buzz
+```
+
+## Performance
+
+PCRE2 provides a fast regular expression library, particularly with JIT compilation enabled.
+Below are the `regex-redux` benchmark results included in this repository,
+
+| Script | Number of runs | Total time | Real time | User time | System time |
+| ------------------- | -------------- | ---------- | ---------- | ----------- | ------------- |
+| baseline.py | 10 | 3.230 | 0.323 | 0.020 | 0.100 |
+| re_vanilla.py | 10 | 51.090 | 5.109 | 11.375 | 0.530 |
+| pcre2_vanilla.py | 10 | 21.980 | 2.198 | 3.154 | 0.483 |
+| pcre2_optimized.py | 10 | 14.860 | 1.486 | 2.520 | 0.548 |
+| cffi_optimized.py | 10 | 14.130 | 1.413 | 3.111 | 0.411 |
+
+Script descriptions are as follows,
+
+| Script | Description |
+| ------------------- | -------------------------------------------------------------------- |
+| `baseline.py` | Reads input file and outputs stored expected output |
+| `re_vanilla.py`      | Pure Python version                                                  |
+| `pcre2_vanilla.py`   | Same as `re_vanilla.py`, with `pcre2` drop-in replacing `re`         |
+| `pcre2_optimized.py` | More optimized implementation using `pcre2`                          |
+| `cffi_optimized.py` | Manually written Python `ctypes` bindings for shared PCRE2 C library |
+
+Tests were performed on an M2 Macbook Air.
+Note that to run benchmarks locally, [Git LFS](https://git-lfs.com/) must be installed to download the input dataset.
+Additionally, a Python virtual environment must be created, and the package built
+with `make init` and `make build` respectively.
+For more information on this benchmark, see [The Computer Language Benchmarks Game](https://benchmarksgame-team.pages.debian.net/benchmarksgame/performance/regexredux.html).
+See source code of benchmark scripts for details and original sources.
--- /dev/null
+CMakeLists.txt
+LICENSE
+Makefile
+README.md
+pyproject.toml
+setup.py
+requirements/build-requirements.txt
+requirements/test-requirements.txt
+src/libpcre2/.editorconfig
+src/libpcre2/.git
+src/libpcre2/.gitattributes
+src/libpcre2/.gitignore
+src/libpcre2/.gitmodules
+src/libpcre2/AUTHORS.md
+src/libpcre2/BUILD.bazel
+src/libpcre2/CMakeLists.txt
+src/libpcre2/COPYING
+src/libpcre2/ChangeLog
+src/libpcre2/HACKING
+src/libpcre2/INSTALL
+src/libpcre2/LICENCE.md
+src/libpcre2/MODULE.bazel
+src/libpcre2/Makefile.am
+src/libpcre2/Makefile.in
+src/libpcre2/NEWS
+src/libpcre2/NON-AUTOTOOLS-BUILD
+src/libpcre2/README
+src/libpcre2/README.md
+src/libpcre2/RunGrepTest
+src/libpcre2/RunGrepTest.bat
+src/libpcre2/RunTest
+src/libpcre2/RunTest.bat
+src/libpcre2/SECURITY.md
+src/libpcre2/aclocal.m4
+src/libpcre2/ar-lib
+src/libpcre2/autogen.sh
+src/libpcre2/build.zig
+src/libpcre2/compile
+src/libpcre2/config.guess
+src/libpcre2/config.sub
+src/libpcre2/configure
+src/libpcre2/configure.ac
+src/libpcre2/depcomp
+src/libpcre2/install-sh
+src/libpcre2/libpcre2-16.pc.in
+src/libpcre2/libpcre2-32.pc.in
+src/libpcre2/libpcre2-8.pc.in
+src/libpcre2/libpcre2-posix.pc.in
+src/libpcre2/ltmain.sh
+src/libpcre2/missing
+src/libpcre2/pcre2-config.in
+src/libpcre2/perltest.sh
+src/libpcre2/test-driver
+src/libpcre2/.github/codecov.yml
+src/libpcre2/.github/dependabot.yml
+src/libpcre2/.github/scripts/merge_sarif.py
+src/libpcre2/.github/workflows/build.yml
+src/libpcre2/.github/workflows/cifuzz.yml
+src/libpcre2/.github/workflows/clang-analyzer.yml
+src/libpcre2/.github/workflows/codeql.yml
+src/libpcre2/.github/workflows/dev.yml
+src/libpcre2/.github/workflows/pages.yml
+src/libpcre2/.github/workflows/scorecards.yml
+src/libpcre2/.github/workflows/sync.yml
+src/libpcre2/cmake/COPYING-CMAKE-SCRIPTS
+src/libpcre2/cmake/FindEditline.cmake
+src/libpcre2/cmake/FindReadline.cmake
+src/libpcre2/cmake/PCRE2CheckVscript.cmake
+src/libpcre2/cmake/PCRE2UseSystemExtensions.cmake
+src/libpcre2/cmake/PCRE2WarningAsError.cmake
+src/libpcre2/cmake/pcre2-config.cmake.in
+src/libpcre2/deps/sljit/.git
+src/libpcre2/deps/sljit/.gitignore
+src/libpcre2/deps/sljit/API_CHANGES
+src/libpcre2/deps/sljit/CMakeLists.txt
+src/libpcre2/deps/sljit/GNUmakefile
+src/libpcre2/deps/sljit/INTERNAL_CHANGES
+src/libpcre2/deps/sljit/LICENSE
+src/libpcre2/deps/sljit/README.md
+src/libpcre2/deps/sljit/.github/workflows/actions.yml
+src/libpcre2/deps/sljit/docs/README.md
+src/libpcre2/deps/sljit/docs/general/architecture.md
+src/libpcre2/deps/sljit/docs/general/contributing.md
+src/libpcre2/deps/sljit/docs/general/introduction.md
+src/libpcre2/deps/sljit/docs/general/getting-started/_category_.json
+src/libpcre2/deps/sljit/docs/general/getting-started/configuration.md
+src/libpcre2/deps/sljit/docs/general/getting-started/setup.md
+src/libpcre2/deps/sljit/docs/general/use-cases/_category_.json
+src/libpcre2/deps/sljit/docs/general/use-cases/bytecode-interpreters.md
+src/libpcre2/deps/sljit/docs/general/use-cases/overview.md
+src/libpcre2/deps/sljit/docs/general/use-cases/pattern-matching/_category_.json
+src/libpcre2/deps/sljit/docs/general/use-cases/pattern-matching/figure1.svg
+src/libpcre2/deps/sljit/docs/general/use-cases/pattern-matching/performance-comparison.md
+src/libpcre2/deps/sljit/docs/general/use-cases/pattern-matching/regular-expression-engine-types.md
+src/libpcre2/deps/sljit/docs/general/use-cases/pattern-matching/speeding-up-pcre2-with-sljit.md
+src/libpcre2/deps/sljit/docs/tutorial/01-overview.md
+src/libpcre2/deps/sljit/docs/tutorial/02-your-first-program.md
+src/libpcre2/deps/sljit/docs/tutorial/03-branching.md
+src/libpcre2/deps/sljit/docs/tutorial/04-calling-external-functions.md
+src/libpcre2/deps/sljit/docs/tutorial/05-accessing-structures.md
+src/libpcre2/deps/sljit/docs/tutorial/06-accessing-arrays.md
+src/libpcre2/deps/sljit/docs/tutorial/07-local-variables.md
+src/libpcre2/deps/sljit/docs/tutorial/08-where-to-go-from-here.md
+src/libpcre2/deps/sljit/docs/tutorial/sources/99bottles.bf
+src/libpcre2/deps/sljit/docs/tutorial/sources/array_access.c
+src/libpcre2/deps/sljit/docs/tutorial/sources/brainfuck.c
+src/libpcre2/deps/sljit/docs/tutorial/sources/branch.c
+src/libpcre2/deps/sljit/docs/tutorial/sources/first_program.c
+src/libpcre2/deps/sljit/docs/tutorial/sources/func_call.c
+src/libpcre2/deps/sljit/docs/tutorial/sources/hello.bf
+src/libpcre2/deps/sljit/docs/tutorial/sources/loop.c
+src/libpcre2/deps/sljit/docs/tutorial/sources/struct_access.c
+src/libpcre2/deps/sljit/docs/tutorial/sources/temp_var.c
+src/libpcre2/deps/sljit/docs/website/.gitignore
+src/libpcre2/deps/sljit/docs/website/README.md
+src/libpcre2/deps/sljit/docs/website/docusaurus.config.js
+src/libpcre2/deps/sljit/docs/website/package-lock.json
+src/libpcre2/deps/sljit/docs/website/package.json
+src/libpcre2/deps/sljit/docs/website/sidebars.js
+src/libpcre2/deps/sljit/docs/website/src/components/HomepageFeatures/index.js
+src/libpcre2/deps/sljit/docs/website/src/components/HomepageFeatures/styles.module.css
+src/libpcre2/deps/sljit/docs/website/src/css/custom.css
+src/libpcre2/deps/sljit/docs/website/src/pages/index.js
+src/libpcre2/deps/sljit/docs/website/src/pages/index.module.css
+src/libpcre2/deps/sljit/docs/website/static/.nojekyll
+src/libpcre2/deps/sljit/docs/website/static/assets/regex-test.tgz
+src/libpcre2/deps/sljit/regex_src/regexJIT.c
+src/libpcre2/deps/sljit/regex_src/regexJIT.h
+src/libpcre2/deps/sljit/regex_src/regexMain.c
+src/libpcre2/deps/sljit/sljit_src/sljitConfig.h
+src/libpcre2/deps/sljit/sljit_src/sljitConfigCPU.h
+src/libpcre2/deps/sljit/sljit_src/sljitConfigInternal.h
+src/libpcre2/deps/sljit/sljit_src/sljitLir.c
+src/libpcre2/deps/sljit/sljit_src/sljitLir.h
+src/libpcre2/deps/sljit/sljit_src/sljitNativeARM_32.c
+src/libpcre2/deps/sljit/sljit_src/sljitNativeARM_64.c
+src/libpcre2/deps/sljit/sljit_src/sljitNativeARM_T2_32.c
+src/libpcre2/deps/sljit/sljit_src/sljitNativeLOONGARCH_64.c
+src/libpcre2/deps/sljit/sljit_src/sljitNativeMIPS_32.c
+src/libpcre2/deps/sljit/sljit_src/sljitNativeMIPS_64.c
+src/libpcre2/deps/sljit/sljit_src/sljitNativeMIPS_common.c
+src/libpcre2/deps/sljit/sljit_src/sljitNativePPC_32.c
+src/libpcre2/deps/sljit/sljit_src/sljitNativePPC_64.c
+src/libpcre2/deps/sljit/sljit_src/sljitNativePPC_common.c
+src/libpcre2/deps/sljit/sljit_src/sljitNativeRISCV_32.c
+src/libpcre2/deps/sljit/sljit_src/sljitNativeRISCV_64.c
+src/libpcre2/deps/sljit/sljit_src/sljitNativeRISCV_common.c
+src/libpcre2/deps/sljit/sljit_src/sljitNativeS390X.c
+src/libpcre2/deps/sljit/sljit_src/sljitNativeX86_32.c
+src/libpcre2/deps/sljit/sljit_src/sljitNativeX86_64.c
+src/libpcre2/deps/sljit/sljit_src/sljitNativeX86_common.c
+src/libpcre2/deps/sljit/sljit_src/sljitSerialize.c
+src/libpcre2/deps/sljit/sljit_src/sljitUtils.c
+src/libpcre2/deps/sljit/sljit_src/allocator_src/sljitExecAllocatorApple.c
+src/libpcre2/deps/sljit/sljit_src/allocator_src/sljitExecAllocatorCore.c
+src/libpcre2/deps/sljit/sljit_src/allocator_src/sljitExecAllocatorFreeBSD.c
+src/libpcre2/deps/sljit/sljit_src/allocator_src/sljitExecAllocatorPosix.c
+src/libpcre2/deps/sljit/sljit_src/allocator_src/sljitExecAllocatorWindows.c
+src/libpcre2/deps/sljit/sljit_src/allocator_src/sljitProtExecAllocatorNetBSD.c
+src/libpcre2/deps/sljit/sljit_src/allocator_src/sljitProtExecAllocatorPosix.c
+src/libpcre2/deps/sljit/sljit_src/allocator_src/sljitWXExecAllocatorPosix.c
+src/libpcre2/deps/sljit/sljit_src/allocator_src/sljitWXExecAllocatorWindows.c
+src/libpcre2/deps/sljit/test_src/sljitConfigPost.h
+src/libpcre2/deps/sljit/test_src/sljitConfigPre.h
+src/libpcre2/deps/sljit/test_src/sljitMain.c
+src/libpcre2/deps/sljit/test_src/sljitTest.c
+src/libpcre2/deps/sljit/test_src/sljitTestBuffers.h
+src/libpcre2/deps/sljit/test_src/sljitTestCall.h
+src/libpcre2/deps/sljit/test_src/sljitTestFloat.h
+src/libpcre2/deps/sljit/test_src/sljitTestSerialize.h
+src/libpcre2/deps/sljit/test_src/sljitTestSimd.h
+src/libpcre2/doc/index.html.src
+src/libpcre2/doc/pcre2-config.1
+src/libpcre2/doc/pcre2-config.txt
+src/libpcre2/doc/pcre2.3
+src/libpcre2/doc/pcre2.txt
+src/libpcre2/doc/pcre2_callout_enumerate.3
+src/libpcre2/doc/pcre2_code_copy.3
+src/libpcre2/doc/pcre2_code_copy_with_tables.3
+src/libpcre2/doc/pcre2_code_free.3
+src/libpcre2/doc/pcre2_compile.3
+src/libpcre2/doc/pcre2_compile_context_copy.3
+src/libpcre2/doc/pcre2_compile_context_create.3
+src/libpcre2/doc/pcre2_compile_context_free.3
+src/libpcre2/doc/pcre2_config.3
+src/libpcre2/doc/pcre2_convert_context_copy.3
+src/libpcre2/doc/pcre2_convert_context_create.3
+src/libpcre2/doc/pcre2_convert_context_free.3
+src/libpcre2/doc/pcre2_converted_pattern_free.3
+src/libpcre2/doc/pcre2_dfa_match.3
+src/libpcre2/doc/pcre2_general_context_copy.3
+src/libpcre2/doc/pcre2_general_context_create.3
+src/libpcre2/doc/pcre2_general_context_free.3
+src/libpcre2/doc/pcre2_get_error_message.3
+src/libpcre2/doc/pcre2_get_mark.3
+src/libpcre2/doc/pcre2_get_match_data_heapframes_size.3
+src/libpcre2/doc/pcre2_get_match_data_size.3
+src/libpcre2/doc/pcre2_get_ovector_count.3
+src/libpcre2/doc/pcre2_get_ovector_pointer.3
+src/libpcre2/doc/pcre2_get_startchar.3
+src/libpcre2/doc/pcre2_jit_compile.3
+src/libpcre2/doc/pcre2_jit_free_unused_memory.3
+src/libpcre2/doc/pcre2_jit_match.3
+src/libpcre2/doc/pcre2_jit_stack_assign.3
+src/libpcre2/doc/pcre2_jit_stack_create.3
+src/libpcre2/doc/pcre2_jit_stack_free.3
+src/libpcre2/doc/pcre2_maketables.3
+src/libpcre2/doc/pcre2_maketables_free.3
+src/libpcre2/doc/pcre2_match.3
+src/libpcre2/doc/pcre2_match_context_copy.3
+src/libpcre2/doc/pcre2_match_context_create.3
+src/libpcre2/doc/pcre2_match_context_free.3
+src/libpcre2/doc/pcre2_match_data_create.3
+src/libpcre2/doc/pcre2_match_data_create_from_pattern.3
+src/libpcre2/doc/pcre2_match_data_free.3
+src/libpcre2/doc/pcre2_next_match.3
+src/libpcre2/doc/pcre2_pattern_convert.3
+src/libpcre2/doc/pcre2_pattern_info.3
+src/libpcre2/doc/pcre2_serialize_decode.3
+src/libpcre2/doc/pcre2_serialize_encode.3
+src/libpcre2/doc/pcre2_serialize_free.3
+src/libpcre2/doc/pcre2_serialize_get_number_of_codes.3
+src/libpcre2/doc/pcre2_set_bsr.3
+src/libpcre2/doc/pcre2_set_callout.3
+src/libpcre2/doc/pcre2_set_character_tables.3
+src/libpcre2/doc/pcre2_set_compile_extra_options.3
+src/libpcre2/doc/pcre2_set_compile_recursion_guard.3
+src/libpcre2/doc/pcre2_set_depth_limit.3
+src/libpcre2/doc/pcre2_set_glob_escape.3
+src/libpcre2/doc/pcre2_set_glob_separator.3
+src/libpcre2/doc/pcre2_set_heap_limit.3
+src/libpcre2/doc/pcre2_set_match_limit.3
+src/libpcre2/doc/pcre2_set_max_pattern_compiled_length.3
+src/libpcre2/doc/pcre2_set_max_pattern_length.3
+src/libpcre2/doc/pcre2_set_max_varlookbehind.3
+src/libpcre2/doc/pcre2_set_newline.3
+src/libpcre2/doc/pcre2_set_offset_limit.3
+src/libpcre2/doc/pcre2_set_optimize.3
+src/libpcre2/doc/pcre2_set_parens_nest_limit.3
+src/libpcre2/doc/pcre2_set_recursion_limit.3
+src/libpcre2/doc/pcre2_set_recursion_memory_management.3
+src/libpcre2/doc/pcre2_set_substitute_callout.3
+src/libpcre2/doc/pcre2_set_substitute_case_callout.3
+src/libpcre2/doc/pcre2_substitute.3
+src/libpcre2/doc/pcre2_substring_copy_byname.3
+src/libpcre2/doc/pcre2_substring_copy_bynumber.3
+src/libpcre2/doc/pcre2_substring_free.3
+src/libpcre2/doc/pcre2_substring_get_byname.3
+src/libpcre2/doc/pcre2_substring_get_bynumber.3
+src/libpcre2/doc/pcre2_substring_length_byname.3
+src/libpcre2/doc/pcre2_substring_length_bynumber.3
+src/libpcre2/doc/pcre2_substring_list_free.3
+src/libpcre2/doc/pcre2_substring_list_get.3
+src/libpcre2/doc/pcre2_substring_nametable_scan.3
+src/libpcre2/doc/pcre2_substring_number_from_name.3
+src/libpcre2/doc/pcre2api.3
+src/libpcre2/doc/pcre2build.3
+src/libpcre2/doc/pcre2callout.3
+src/libpcre2/doc/pcre2compat.3
+src/libpcre2/doc/pcre2convert.3
+src/libpcre2/doc/pcre2demo.3
+src/libpcre2/doc/pcre2grep.1
+src/libpcre2/doc/pcre2grep.txt
+src/libpcre2/doc/pcre2jit.3
+src/libpcre2/doc/pcre2limits.3
+src/libpcre2/doc/pcre2matching.3
+src/libpcre2/doc/pcre2partial.3
+src/libpcre2/doc/pcre2pattern.3
+src/libpcre2/doc/pcre2perform.3
+src/libpcre2/doc/pcre2posix.3
+src/libpcre2/doc/pcre2sample.3
+src/libpcre2/doc/pcre2serialize.3
+src/libpcre2/doc/pcre2syntax.3
+src/libpcre2/doc/pcre2test.1
+src/libpcre2/doc/pcre2test.txt
+src/libpcre2/doc/pcre2unicode.3
+src/libpcre2/doc/html/NON-AUTOTOOLS-BUILD.txt
+src/libpcre2/doc/html/README.txt
+src/libpcre2/doc/html/index.html
+src/libpcre2/doc/html/pcre2-config.html
+src/libpcre2/doc/html/pcre2.html
+src/libpcre2/doc/html/pcre2_callout_enumerate.html
+src/libpcre2/doc/html/pcre2_code_copy.html
+src/libpcre2/doc/html/pcre2_code_copy_with_tables.html
+src/libpcre2/doc/html/pcre2_code_free.html
+src/libpcre2/doc/html/pcre2_compile.html
+src/libpcre2/doc/html/pcre2_compile_context_copy.html
+src/libpcre2/doc/html/pcre2_compile_context_create.html
+src/libpcre2/doc/html/pcre2_compile_context_free.html
+src/libpcre2/doc/html/pcre2_config.html
+src/libpcre2/doc/html/pcre2_convert_context_copy.html
+src/libpcre2/doc/html/pcre2_convert_context_create.html
+src/libpcre2/doc/html/pcre2_convert_context_free.html
+src/libpcre2/doc/html/pcre2_converted_pattern_free.html
+src/libpcre2/doc/html/pcre2_dfa_match.html
+src/libpcre2/doc/html/pcre2_general_context_copy.html
+src/libpcre2/doc/html/pcre2_general_context_create.html
+src/libpcre2/doc/html/pcre2_general_context_free.html
+src/libpcre2/doc/html/pcre2_get_error_message.html
+src/libpcre2/doc/html/pcre2_get_mark.html
+src/libpcre2/doc/html/pcre2_get_match_data_heapframes_size.html
+src/libpcre2/doc/html/pcre2_get_match_data_size.html
+src/libpcre2/doc/html/pcre2_get_ovector_count.html
+src/libpcre2/doc/html/pcre2_get_ovector_pointer.html
+src/libpcre2/doc/html/pcre2_get_startchar.html
+src/libpcre2/doc/html/pcre2_jit_compile.html
+src/libpcre2/doc/html/pcre2_jit_free_unused_memory.html
+src/libpcre2/doc/html/pcre2_jit_match.html
+src/libpcre2/doc/html/pcre2_jit_stack_assign.html
+src/libpcre2/doc/html/pcre2_jit_stack_create.html
+src/libpcre2/doc/html/pcre2_jit_stack_free.html
+src/libpcre2/doc/html/pcre2_maketables.html
+src/libpcre2/doc/html/pcre2_maketables_free.html
+src/libpcre2/doc/html/pcre2_match.html
+src/libpcre2/doc/html/pcre2_match_context_copy.html
+src/libpcre2/doc/html/pcre2_match_context_create.html
+src/libpcre2/doc/html/pcre2_match_context_free.html
+src/libpcre2/doc/html/pcre2_match_data_create.html
+src/libpcre2/doc/html/pcre2_match_data_create_from_pattern.html
+src/libpcre2/doc/html/pcre2_match_data_free.html
+src/libpcre2/doc/html/pcre2_next_match.html
+src/libpcre2/doc/html/pcre2_pattern_convert.html
+src/libpcre2/doc/html/pcre2_pattern_info.html
+src/libpcre2/doc/html/pcre2_serialize_decode.html
+src/libpcre2/doc/html/pcre2_serialize_encode.html
+src/libpcre2/doc/html/pcre2_serialize_free.html
+src/libpcre2/doc/html/pcre2_serialize_get_number_of_codes.html
+src/libpcre2/doc/html/pcre2_set_bsr.html
+src/libpcre2/doc/html/pcre2_set_callout.html
+src/libpcre2/doc/html/pcre2_set_character_tables.html
+src/libpcre2/doc/html/pcre2_set_compile_extra_options.html
+src/libpcre2/doc/html/pcre2_set_compile_recursion_guard.html
+src/libpcre2/doc/html/pcre2_set_depth_limit.html
+src/libpcre2/doc/html/pcre2_set_glob_escape.html
+src/libpcre2/doc/html/pcre2_set_glob_separator.html
+src/libpcre2/doc/html/pcre2_set_heap_limit.html
+src/libpcre2/doc/html/pcre2_set_match_limit.html
+src/libpcre2/doc/html/pcre2_set_max_pattern_compiled_length.html
+src/libpcre2/doc/html/pcre2_set_max_pattern_length.html
+src/libpcre2/doc/html/pcre2_set_max_varlookbehind.html
+src/libpcre2/doc/html/pcre2_set_newline.html
+src/libpcre2/doc/html/pcre2_set_offset_limit.html
+src/libpcre2/doc/html/pcre2_set_optimize.html
+src/libpcre2/doc/html/pcre2_set_parens_nest_limit.html
+src/libpcre2/doc/html/pcre2_set_recursion_limit.html
+src/libpcre2/doc/html/pcre2_set_recursion_memory_management.html
+src/libpcre2/doc/html/pcre2_set_substitute_callout.html
+src/libpcre2/doc/html/pcre2_set_substitute_case_callout.html
+src/libpcre2/doc/html/pcre2_substitute.html
+src/libpcre2/doc/html/pcre2_substring_copy_byname.html
+src/libpcre2/doc/html/pcre2_substring_copy_bynumber.html
+src/libpcre2/doc/html/pcre2_substring_free.html
+src/libpcre2/doc/html/pcre2_substring_get_byname.html
+src/libpcre2/doc/html/pcre2_substring_get_bynumber.html
+src/libpcre2/doc/html/pcre2_substring_length_byname.html
+src/libpcre2/doc/html/pcre2_substring_length_bynumber.html
+src/libpcre2/doc/html/pcre2_substring_list_free.html
+src/libpcre2/doc/html/pcre2_substring_list_get.html
+src/libpcre2/doc/html/pcre2_substring_nametable_scan.html
+src/libpcre2/doc/html/pcre2_substring_number_from_name.html
+src/libpcre2/doc/html/pcre2api.html
+src/libpcre2/doc/html/pcre2build.html
+src/libpcre2/doc/html/pcre2callout.html
+src/libpcre2/doc/html/pcre2compat.html
+src/libpcre2/doc/html/pcre2convert.html
+src/libpcre2/doc/html/pcre2demo.html
+src/libpcre2/doc/html/pcre2grep.html
+src/libpcre2/doc/html/pcre2jit.html
+src/libpcre2/doc/html/pcre2limits.html
+src/libpcre2/doc/html/pcre2matching.html
+src/libpcre2/doc/html/pcre2partial.html
+src/libpcre2/doc/html/pcre2pattern.html
+src/libpcre2/doc/html/pcre2perform.html
+src/libpcre2/doc/html/pcre2posix.html
+src/libpcre2/doc/html/pcre2sample.html
+src/libpcre2/doc/html/pcre2serialize.html
+src/libpcre2/doc/html/pcre2syntax.html
+src/libpcre2/doc/html/pcre2test.html
+src/libpcre2/doc/html/pcre2unicode.html
+src/libpcre2/m4/ax_check_vscript.m4
+src/libpcre2/m4/ax_pthread.m4
+src/libpcre2/m4/libtool.m4
+src/libpcre2/m4/ltoptions.m4
+src/libpcre2/m4/ltsugar.m4
+src/libpcre2/m4/ltversion.m4
+src/libpcre2/m4/lt~obsolete.m4
+src/libpcre2/m4/pcre2_visibility.m4
+src/libpcre2/m4/pcre2_zos.m4
+src/libpcre2/maint/.gitignore
+src/libpcre2/maint/132html
+src/libpcre2/maint/CheckMan
+src/libpcre2/maint/CheckTxt
+src/libpcre2/maint/CleanTxt
+src/libpcre2/maint/Detrail
+src/libpcre2/maint/FilterCoverage.py
+src/libpcre2/maint/GenerateCommon.py
+src/libpcre2/maint/GenerateTest.py
+src/libpcre2/maint/GenerateUcd.py
+src/libpcre2/maint/GenerateUcpHeader.py
+src/libpcre2/maint/GenerateUcpTables.py
+src/libpcre2/maint/LintMan
+src/libpcre2/maint/ManyConfigTests
+src/libpcre2/maint/README
+src/libpcre2/maint/RunCoverage
+src/libpcre2/maint/RunManifestTest
+src/libpcre2/maint/RunManifestTest.ps1
+src/libpcre2/maint/RunPerlTest
+src/libpcre2/maint/RunSymbolTest
+src/libpcre2/maint/RunSymbolTest.ps1
+src/libpcre2/maint/UpdateAlways
+src/libpcre2/maint/UpdateCommon.py
+src/libpcre2/maint/UpdateDates.py
+src/libpcre2/maint/UpdateRelease.py
+src/libpcre2/maint/manifest-cmakeinstall-freebsd
+src/libpcre2/maint/manifest-cmakeinstall-linux
+src/libpcre2/maint/manifest-cmakeinstall-macos
+src/libpcre2/maint/manifest-cmakeinstall-solaris
+src/libpcre2/maint/manifest-cmakeinstall-windows
+src/libpcre2/maint/manifest-libpcre2-16.so
+src/libpcre2/maint/manifest-libpcre2-32.so
+src/libpcre2/maint/manifest-libpcre2-8.so
+src/libpcre2/maint/manifest-libpcre2-posix.so
+src/libpcre2/maint/manifest-makeinstall-freebsd
+src/libpcre2/maint/manifest-makeinstall-linux
+src/libpcre2/maint/manifest-makeinstall-solaris
+src/libpcre2/maint/manifest-tarball
+src/libpcre2/maint/pcre2_chartables.c.non-standard
+src/libpcre2/maint/ucptest.c
+src/libpcre2/maint/utf8.c
+src/libpcre2/maint/Unicode.tables/BidiMirroring.txt
+src/libpcre2/maint/Unicode.tables/CaseFolding.txt
+src/libpcre2/maint/Unicode.tables/DerivedBidiClass.txt
+src/libpcre2/maint/Unicode.tables/DerivedCoreProperties.txt
+src/libpcre2/maint/Unicode.tables/DerivedGeneralCategory.txt
+src/libpcre2/maint/Unicode.tables/GraphemeBreakProperty.txt
+src/libpcre2/maint/Unicode.tables/PropList.txt
+src/libpcre2/maint/Unicode.tables/PropertyAliases.txt
+src/libpcre2/maint/Unicode.tables/PropertyValueAliases.txt
+src/libpcre2/maint/Unicode.tables/ScriptExtensions.txt
+src/libpcre2/maint/Unicode.tables/Scripts.txt
+src/libpcre2/maint/Unicode.tables/UnicodeData.txt
+src/libpcre2/maint/Unicode.tables/emoji-data.txt
+src/libpcre2/maint/cmake-tests/build-interface/CMakeLists.txt
+src/libpcre2/maint/cmake-tests/build-interface/main.c
+src/libpcre2/maint/cmake-tests/install-interface/CMakeLists.txt
+src/libpcre2/maint/cmake-tests/install-interface/main.c
+src/libpcre2/maint/ucptestdata/testinput1
+src/libpcre2/maint/ucptestdata/testinput2
+src/libpcre2/maint/ucptestdata/testoutput1
+src/libpcre2/maint/ucptestdata/testoutput2
+src/libpcre2/src/config-cmake.h.in
+src/libpcre2/src/config.h.generic
+src/libpcre2/src/config.h.in
+src/libpcre2/src/libpcre2-16.sym
+src/libpcre2/src/libpcre2-32.sym
+src/libpcre2/src/libpcre2-8.sym
+src/libpcre2/src/libpcre2-posix.sym
+src/libpcre2/src/pcre2.h.generic
+src/libpcre2/src/pcre2.h.in
+src/libpcre2/src/pcre2_auto_possess.c
+src/libpcre2/src/pcre2_chartables.c.dist
+src/libpcre2/src/pcre2_chartables.c.ebcdic-1047-nl15
+src/libpcre2/src/pcre2_chartables.c.ebcdic-1047-nl25
+src/libpcre2/src/pcre2_chkdint.c
+src/libpcre2/src/pcre2_compile.c
+src/libpcre2/src/pcre2_compile.h
+src/libpcre2/src/pcre2_compile_cgroup.c
+src/libpcre2/src/pcre2_compile_class.c
+src/libpcre2/src/pcre2_config.c
+src/libpcre2/src/pcre2_context.c
+src/libpcre2/src/pcre2_convert.c
+src/libpcre2/src/pcre2_dfa_match.c
+src/libpcre2/src/pcre2_dftables.c
+src/libpcre2/src/pcre2_error.c
+src/libpcre2/src/pcre2_extuni.c
+src/libpcre2/src/pcre2_find_bracket.c
+src/libpcre2/src/pcre2_fuzzsupport.c
+src/libpcre2/src/pcre2_internal.h
+src/libpcre2/src/pcre2_intmodedep.h
+src/libpcre2/src/pcre2_jit_char_inc.h
+src/libpcre2/src/pcre2_jit_compile.c
+src/libpcre2/src/pcre2_jit_match_inc.h
+src/libpcre2/src/pcre2_jit_misc_inc.h
+src/libpcre2/src/pcre2_jit_simd_inc.h
+src/libpcre2/src/pcre2_jit_test.c
+src/libpcre2/src/pcre2_maketables.c
+src/libpcre2/src/pcre2_match.c
+src/libpcre2/src/pcre2_match_data.c
+src/libpcre2/src/pcre2_match_next.c
+src/libpcre2/src/pcre2_newline.c
+src/libpcre2/src/pcre2_ord2utf.c
+src/libpcre2/src/pcre2_pattern_info.c
+src/libpcre2/src/pcre2_printint_inc.h
+src/libpcre2/src/pcre2_script_run.c
+src/libpcre2/src/pcre2_serialize.c
+src/libpcre2/src/pcre2_string_utils.c
+src/libpcre2/src/pcre2_study.c
+src/libpcre2/src/pcre2_substitute.c
+src/libpcre2/src/pcre2_substring.c
+src/libpcre2/src/pcre2_tables.c
+src/libpcre2/src/pcre2_ucd.c
+src/libpcre2/src/pcre2_ucp.h
+src/libpcre2/src/pcre2_ucptables_inc.h
+src/libpcre2/src/pcre2_util.h
+src/libpcre2/src/pcre2_valid_utf.c
+src/libpcre2/src/pcre2_xclass.c
+src/libpcre2/src/pcre2demo.c
+src/libpcre2/src/pcre2grep.c
+src/libpcre2/src/pcre2posix.c
+src/libpcre2/src/pcre2posix.h
+src/libpcre2/src/pcre2posix_test.c
+src/libpcre2/src/pcre2test.c
+src/libpcre2/src/pcre2test_inc.h
+src/libpcre2/testdata/grepbinary
+src/libpcre2/testdata/grepfilelist
+src/libpcre2/testdata/grepinput
+src/libpcre2/testdata/grepinput3
+src/libpcre2/testdata/grepinput8
+src/libpcre2/testdata/grepinputBad8
+src/libpcre2/testdata/grepinputBad8_Trail
+src/libpcre2/testdata/grepinputC.bz2
+src/libpcre2/testdata/grepinputC.gz
+src/libpcre2/testdata/grepinputM
+src/libpcre2/testdata/grepinputUN
+src/libpcre2/testdata/grepinputv
+src/libpcre2/testdata/grepinputx
+src/libpcre2/testdata/greplist
+src/libpcre2/testdata/greplistBad
+src/libpcre2/testdata/grepnot.bz2
+src/libpcre2/testdata/grepoutput
+src/libpcre2/testdata/grepoutput8
+src/libpcre2/testdata/grepoutputC
+src/libpcre2/testdata/grepoutputCN
+src/libpcre2/testdata/grepoutputCNU
+src/libpcre2/testdata/grepoutputCU
+src/libpcre2/testdata/grepoutputCbz2
+src/libpcre2/testdata/grepoutputCgz
+src/libpcre2/testdata/grepoutputN
+src/libpcre2/testdata/grepoutputUN
+src/libpcre2/testdata/greppatN4
+src/libpcre2/testdata/testbtables
+src/libpcre2/testdata/testinput1
+src/libpcre2/testdata/testinput10
+src/libpcre2/testdata/testinput11
+src/libpcre2/testdata/testinput12
+src/libpcre2/testdata/testinput13
+src/libpcre2/testdata/testinput14
+src/libpcre2/testdata/testinput15
+src/libpcre2/testdata/testinput16
+src/libpcre2/testdata/testinput17
+src/libpcre2/testdata/testinput18
+src/libpcre2/testdata/testinput19
+src/libpcre2/testdata/testinput2
+src/libpcre2/testdata/testinput20
+src/libpcre2/testdata/testinput21
+src/libpcre2/testdata/testinput22
+src/libpcre2/testdata/testinput23
+src/libpcre2/testdata/testinput24
+src/libpcre2/testdata/testinput25
+src/libpcre2/testdata/testinput26
+src/libpcre2/testdata/testinput27
+src/libpcre2/testdata/testinput28
+src/libpcre2/testdata/testinput29
+src/libpcre2/testdata/testinput3
+src/libpcre2/testdata/testinput4
+src/libpcre2/testdata/testinput5
+src/libpcre2/testdata/testinput6
+src/libpcre2/testdata/testinput7
+src/libpcre2/testdata/testinput8
+src/libpcre2/testdata/testinput9
+src/libpcre2/testdata/testinputheap
+src/libpcre2/testdata/testoutput1
+src/libpcre2/testdata/testoutput10
+src/libpcre2/testdata/testoutput11-16
+src/libpcre2/testdata/testoutput11-32
+src/libpcre2/testdata/testoutput12-16
+src/libpcre2/testdata/testoutput12-32
+src/libpcre2/testdata/testoutput13
+src/libpcre2/testdata/testoutput14-16
+src/libpcre2/testdata/testoutput14-32
+src/libpcre2/testdata/testoutput14-8
+src/libpcre2/testdata/testoutput15
+src/libpcre2/testdata/testoutput16
+src/libpcre2/testdata/testoutput17
+src/libpcre2/testdata/testoutput18
+src/libpcre2/testdata/testoutput19
+src/libpcre2/testdata/testoutput2
+src/libpcre2/testdata/testoutput20
+src/libpcre2/testdata/testoutput21
+src/libpcre2/testdata/testoutput22-16
+src/libpcre2/testdata/testoutput22-32
+src/libpcre2/testdata/testoutput22-8
+src/libpcre2/testdata/testoutput23
+src/libpcre2/testdata/testoutput24
+src/libpcre2/testdata/testoutput25
+src/libpcre2/testdata/testoutput26
+src/libpcre2/testdata/testoutput27
+src/libpcre2/testdata/testoutput28
+src/libpcre2/testdata/testoutput29
+src/libpcre2/testdata/testoutput3
+src/libpcre2/testdata/testoutput3A
+src/libpcre2/testdata/testoutput3B
+src/libpcre2/testdata/testoutput3C
+src/libpcre2/testdata/testoutput4
+src/libpcre2/testdata/testoutput5
+src/libpcre2/testdata/testoutput6
+src/libpcre2/testdata/testoutput7
+src/libpcre2/testdata/testoutput8-16-2
+src/libpcre2/testdata/testoutput8-16-4
+src/libpcre2/testdata/testoutput8-32-4
+src/libpcre2/testdata/testoutput8-8-2
+src/libpcre2/testdata/testoutput8-8-3
+src/libpcre2/testdata/testoutput8-8-4
+src/libpcre2/testdata/testoutput9
+src/libpcre2/testdata/testoutputheap-16
+src/libpcre2/testdata/testoutputheap-32
+src/libpcre2/testdata/testoutputheap-8
+src/libpcre2/testdata/valgrind-jit.supp
+src/libpcre2/testdata/wintestinput3
+src/libpcre2/testdata/wintestoutput3
+src/libpcre2/testdata/fuzzing/pcre2_fuzzer.dict
+src/libpcre2/testdata/fuzzing/pcre2_fuzzer.options
+src/libpcre2/testdata/fuzzing/pcre2_fuzzer_16.dict
+src/libpcre2/testdata/fuzzing/pcre2_fuzzer_16.options
+src/libpcre2/testdata/fuzzing/pcre2_fuzzer_32.dict
+src/libpcre2/testdata/fuzzing/pcre2_fuzzer_32.options
+src/libpcre2/vms/configure.com
+src/libpcre2/vms/openvms_readme.txt
+src/libpcre2/vms/pcre2.h_patch
+src/libpcre2/vms/stdint.h
+src/pcre2/CMakeLists.txt
+src/pcre2/__init__.py
+src/pcre2/_cy.pyx
+src/pcre2/_libpcre2.pxd
+src/pcre2.egg-info/PKG-INFO
+src/pcre2.egg-info/SOURCES.txt
+src/pcre2.egg-info/dependency_links.txt
+src/pcre2.egg-info/top_level.txt
+tests/test_groups.py
+tests/test_match.py
+tests/test_pattern.py
+tests/test_re_compatibility.py
\ No newline at end of file
--- /dev/null
# Locate the Cython compiler and Python-extension helper modules supplied by
# scikit-build (they provide add_cython_target / python_extension_module).
find_package(Cython MODULE REQUIRED)
find_package(PythonExtensions MODULE REQUIRED)

# NOTE(review): directory-scoped; applies to every target defined below.
include_directories(${CMAKE_CURRENT_SOURCE_DIR})

# Build Cython with annotations.
set(CYTHON_ANNOTATE TRUE)
+
# Macro to add Cython files as modules, configured to build with PCRE2.
#
# Arguments:
#   filename - basename (without extension) of a .pyx file in the current
#              source directory; also used as the target / extension module
#              name.
#
# Fix: the original body used the invalid placeholder "$(unknown)" instead of
# the CMake variable dereference "${filename}".
macro(add_pyx_file filename)
    add_cython_target(${filename} C PY3)
    add_library(${filename} MODULE ${filename})
    python_extension_module(${filename})

    # Link against the static 8-bit PCRE2 library built under src/libpcre2.
    target_link_libraries(${filename} PRIVATE pcre2-8-static)
    target_include_directories(${filename} PRIVATE ${PCRE2_INCLUDE_DIR})
    target_compile_options(${filename} PRIVATE ${CYTHON_EXTRA_COMPILE_ARGS})

    install(TARGETS ${filename} LIBRARY DESTINATION src/pcre2)
endmacro()
+
# Each Cython module is added explicitly; globbing for sources is
# recommended against:
# https://cmake.org/cmake/help/v3.14/command/file.html?highlight=file#filesystem
add_pyx_file(_cy)


# Include .pyx and .pxd files in distribution for use by Cython API.
install(
    FILES
        _libpcre2.pxd
        _cy.pyx
    DESTINATION
        src/pcre2
)
\ No newline at end of file
--- /dev/null
+from . import _cy
+
+from enum import auto, IntFlag
+import operator
+from itertools import islice
+from functools import lru_cache, reduce
+from types import MappingProxyType
+from sys import maxsize
+
+# The below implementation uses as a base that of Google's RE2 Python bindings:
+# https://github.com/google/re2/tree/main/python
+
+
# ============================================================================
# Constants

# Version of these Python bindings.
__version__ = "0.6.0"
# Version of the underlying PCRE2 C library, as reported by the Cython layer.
__libpcre2_version__ = _cy.__libpcre2_version__
+
+
class RegexFlag(IntFlag):
    """
    Flag values accepted by `compile` and the module-level matching functions.

    Each member maps onto a PCRE2 compile option, mirroring the flag names of
    the standard-library `re` module.
    """

    # Flags either enable (True) or disable (False) PCRE2 options
    NOFLAG = 0
    IGNORECASE = _cy.CompileOption.CASELESS # Ignore case
    UNICODE = _cy.CompileOption.UTF # Assume unicode "locale"
    MULTILINE = _cy.CompileOption.MULTILINE # Make anchors look for newline
    DOTALL = _cy.CompileOption.DOTALL # Make dot match newline
    VERBOSE = _cy.CompileOption.EXTENDED # Ignore whitespace and comments

    # No corresponding flag in PCRE2, but is the opposite of `_cy.CompileOption.UCP`
    ASCII = auto() # ASCII-only matching for character classes
+
+
# Module-level flag aliases, including the single-letter shorthands used by
# the standard-library `re` module.
NOFLAG = RegexFlag.NOFLAG
ASCII = A = RegexFlag.ASCII
IGNORECASE = I = RegexFlag.IGNORECASE
UNICODE = U = RegexFlag.UNICODE
MULTILINE = M = RegexFlag.MULTILINE
DOTALL = S = RegexFlag.DOTALL
VERBOSE = X = RegexFlag.VERBOSE


# Exception types re-exported from the Cython layer; `error` matches the
# `re.error` naming convention.
LibraryError = _cy.LibraryError
PatternError = error = _cy.PatternError
+
+
+# ============================================================================
+# Internal Utilities
+
+
+def _typeguard_strings(s):
+ if isinstance(s, str):
+ return str(s)
+ elif isinstance(s, (bytes, bytearray, memoryview)):
+ return bytes(s)
+ raise TypeError(f"Cannot process type {s}")
+
+
+# ============================================================================
+# Top-Level Functions
+
+
def compile(pattern, flags=0, jit=True):
    """
    Compile a regular expression pattern, returning a Pattern object.

    `pattern` may be a str/bytes pattern or an already-compiled Pattern.
    `flags` is a bitwise combination of RegexFlag values; it may only be given
    together with an uncompiled pattern. When `jit` is True the pattern is
    additionally JIT compiled.
    """
    # Avoid recompilation if the pattern is already compiled with no option changes
    if isinstance(pattern, Pattern):
        if not flags == 0:
            raise ValueError("Cannot process flags argument with a compiled pattern")
        if pattern.jit == jit:
            return pattern
        # Only the JIT setting differs: recompile the underlying string while
        # preserving the flags it was originally compiled with. (Previously
        # the original flags were dropped here, silently recompiling with no
        # flags at all.)
        flags = pattern.flags
        pattern = pattern.pattern

    pattern = _typeguard_strings(pattern)
    flags = RegexFlag(flags)

    # Handle ASCII flag, defined as the disabling of the UCP PCRE2 option
    options = flags & ~RegexFlag.ASCII
    disabled_options = _cy.CompileOption.UCP if flags & RegexFlag.ASCII else 0

    pcre2_code = _cy.compile(pattern, options, disabled_options)
    if jit:
        _cy.jit_compile(pcre2_code)
    return Pattern(pcre2_code, pattern, flags, jit)
+
+
def search(pattern, string, flags=0, jit=True):
    """
    Scan `string` for the first location where the pattern matches. Returns a
    Match object, or None if no match was found.
    """
    compiled = compile(pattern, flags, jit)
    return compiled.search(string)
+
+
def match(pattern, string, flags=0, jit=True):
    """
    Try to apply the pattern at the start of `string`. Returns a Match object,
    or None if no match was found.
    """
    compiled = compile(pattern, flags, jit)
    return compiled.match(string)
+
+
def fullmatch(pattern, string, flags=0, jit=True):
    """
    Try to apply the pattern to the entirety of `string`. Returns a Match
    object, or None if no match was found.
    """
    compiled = compile(pattern, flags, jit)
    return compiled.fullmatch(string)
+
+
def finditer(pattern, string, flags=0, jit=True):
    """
    Yield a Match object for each non-overlapping match of the pattern in the
    string.
    """
    compiled = compile(pattern, flags, jit)
    return compiled.finditer(string)
+
+
def findall(pattern, string, flags=0, jit=True):
    """
    Return a list of all non-overlapping matches of the pattern in `string`.

    With one or more capture groups present, the list contains the group
    contents for each match instead. Empty matches are included.
    """
    compiled = compile(pattern, flags, jit)
    return compiled.findall(string)
+
+
def split(pattern, string, maxsplit=0, flags=0, jit=True):
    """
    Split `string` at each occurrence of the pattern, returning the list of
    resulting substrings.

    When the pattern contains capture groups, their text is interleaved into
    the result. A non-zero `maxsplit` caps the number of splits; the remainder
    of `string` becomes the final list element.
    """
    compiled = compile(pattern, flags, jit)
    return compiled.split(string, maxsplit)
+
+
def subn(pattern, repl, string, count=0, flags=0, jit=True):
    """
    Replace the leftmost non-overlapping occurrences of the pattern in
    `string` with `repl`, returning the tuple `(res, number)` where `res` is
    the resulting string and `number` the count of substitutions performed.

    `repl` may be a string or a callable; a callable receives the Match object
    and must return the replacement string.
    """
    compiled = compile(pattern, flags, jit)
    return compiled.subn(repl, string, count)
+
+
def sub(pattern, repl, string, count=0, flags=0, jit=True):
    """
    Replace the leftmost non-overlapping occurrences of the pattern in
    `string` with `repl`, returning the resulting string.

    `repl` may be a string or a callable; a callable receives the Match object
    and must return the replacement string.
    """
    compiled = compile(pattern, flags, jit)
    return compiled.sub(repl, string, count)
+
+
+# ============================================================================
+# Pattern Object
+
+
class Pattern:
    """
    A compiled PCRE2 regular expression, mirroring the `re.Pattern` interface.

    Instances should be obtained through `pcre2.compile` rather than
    constructed directly.
    """

    def __init__(self, pcre2_code, pattern, flags, jit):
        # pcre2_code: compiled-pattern handle owned by the Cython layer.
        # pattern:    original pattern source (str or bytes).
        # flags:      RegexFlag options the pattern was compiled with.
        # jit:        whether the pattern has been JIT compiled.
        if not isinstance(pcre2_code, _cy.PCRE2Code):
            raise ValueError(
                "PCRE2 code must be of type `_cy.PCRE2Code`. It is not recommended to instantiate "
                "`Pattern` objects directly. Instead, use `pcre2.compile`."
            )
        self._pcre2_code = pcre2_code
        self.pattern = pattern
        self.flags = flags
        self.jit = jit

    def __getstate__(self):
        # Support pickling: the compiled-code handle cannot be serialized.
        state = self.__dict__.copy()
        del state["_pcre2_code"] # Remove the unpicklable pointer
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        # Note that patterns are recompiled - and optionally JIT compiled - when unpickling
        # NOTE(review): unlike `pcre2.compile`, this passes `self.flags` straight
        # through without splitting out the ASCII flag into disabled options —
        # confirm this round-trips patterns compiled with RegexFlag.ASCII.
        self._pcre2_code = _cy.compile(self.pattern, self.flags)
        if self.jit:
            _cy.jit_compile(self._pcre2_code)

    # NOTE(review): `@property` over `@lru_cache(1)` (here and on `groupindex`)
    # shares a single cache slot across all instances and keeps a strong
    # reference to the most recent `self`; functools.cached_property may be a
    # better fit — confirm.
    @property
    @lru_cache(1)
    def groups(self):
        # Number of capturing groups in the pattern.
        return _cy.pattern_capture_count(self._pcre2_code)

    @property
    @lru_cache(1)
    def groupindex(self):
        # Read-only mapping of group name -> group number.
        groupindex = _cy.pattern_name_dict(self._pcre2_code)
        return MappingProxyType(groupindex)

    def jit_compile(self):
        """
        JIT compile the pattern, or nothing if the pattern is already JIT compiled.
        """
        if not self.jit:
            _cy.jit_compile(self._pcre2_code)
            self.jit = True

    def _match(self, string, pos=0, endpos=maxsize, options=0):
        # Shared driver for search/match/fullmatch. `pos` and `endpos` are
        # clamped into [0, len(string)] before being handed to the C layer.
        string = _typeguard_strings(string)
        pos = max(0, min(pos, len(string)))
        endpos = max(0, min(endpos, len(string)))
        match_data, match_byte_offset, match_options = _cy.match(
            self._pcre2_code, string, endpos, pos, options
        )
        if match_data:
            return Match(match_data, self, string, pos, endpos, match_byte_offset, match_options)
        return None

    def search(self, string, pos=0, endpos=maxsize):
        """
        Scan through `string` looking for a match to the pattern, returning a Match object, or None
        if no match was found.
        """
        return self._match(string, pos, endpos)

    def match(self, string, pos=0, endpos=maxsize):
        """
        Match the pattern at the start of `string`, returning a Match object, or None if no match
        was found.
        """
        # Anchoring at `pos` is what distinguishes match() from search().
        return self._match(string, pos, endpos, options=_cy.MatchOption.ANCHORED)

    def fullmatch(self, string, pos=0, endpos=maxsize):
        """
        Match the pattern to all of `string`, returning a Match object, or None if no match was
        found.
        """
        # Anchored at both ends so the whole region must match.
        options = _cy.MatchOption.ANCHORED | _cy.MatchOption.ENDANCHORED
        return self._match(string, pos, endpos, options=options)

    def finditer(self, string, pos=0, endpos=maxsize):
        """
        Return an iterator of Match objects for each non-overlapping match in the string.
        """
        string = _typeguard_strings(string)
        pos = max(0, min(pos, len(string)))
        endpos = max(0, min(endpos, len(string)))
        # The Cython generator drives the repeated-match loop; each yielded
        # triple is wrapped in a Match object lazily.
        for match_data, match_byte_offset, match_options in _cy.match_generator(
            self._pcre2_code, string, endpos, pos
        ):
            yield Match(match_data, self, string, pos, endpos, match_byte_offset, match_options)

    def findall(self, string, pos=0, endpos=maxsize):
        """
        Return a list of all non-overlapping matches in `string`.

        If one or more capture groups are present, return a list of groups for each match. Empty
        matches are included in the result.
        """
        string = _typeguard_strings(string)
        # `empty` is "" or b"" to match the subject's type; used as the
        # default for unset groups.
        empty = type(string)()
        items = []
        for match in self.finditer(string, pos, endpos):
            if not self.groups:
                item = match.group()
            elif self.groups == 1:
                # Single group: flatten to the group's text instead of a tuple.
                item = match.groups(default=empty)[0]
            else:
                item = match.groups(default=empty)
            items.append(item)
        return items

    def split(self, string, maxsplit=0):
        """
        Split the source string by the occurrences of the pattern, returning a list containing the
        resulting substrings.

        If capture groups are used in pattern, then the text of all groups are also returned. If
        `maxsplit` is non-zero, at most `maxsplit` splits occur, and the remainder of `string` is
        returned as the final element of the list.
        """
        string = _typeguard_strings(string)
        # Negative maxsplit: perform no splits at all.
        if maxsplit < 0:
            return [string]
        parts = []
        start = 0
        # islice(..., None) iterates all matches when maxsplit == 0.
        for match in islice(self.finditer(string), maxsplit or None):
            parts.append(string[start : match.start()])
            # Interleave the text of each capture group, as `re.split` does.
            parts.extend(map(match.__getitem__, range(1, self.groups + 1)))
            start = match.end()
        parts.append(string[start:])
        return parts

    def _suball(self, template, string):
        # Global substitution delegated entirely to PCRE2; unset groups expand
        # to the empty string. Returns (result, number_of_substitutions).
        template = _typeguard_strings(template)
        string = _typeguard_strings(string)
        options = _cy.SubstituteOption.GLOBAL | _cy.SubstituteOption.UNSET_EMPTY
        byte_offset = 0
        return _cy.substitute(self._pcre2_code, template, string, byte_offset, options=options)

    def subn(self, repl, string, count=0):
        """
        Return a tuple containing `(res, number)`. `res` is the string obtained by replacing the
        leftmost non-overlapping occurrences of the pattern in `string` by the replacement `repl`.
        `number` is the number of substitutions that were made.

        `repl` can be either a string or a callable. If it is a callable, it's passed the Match
        object and must return a replacement string to be used.
        """
        string = _typeguard_strings(string)
        # Negative count: no substitutions are made.
        if count < 0:
            return (string, 0)

        # Short circuit for global substitute
        if count == 0 and not callable(repl):
            return self._suball(repl, string)

        parts = []
        empty = type(string)()

        # Pure python needed to apply callback functions
        if callable(repl):
            start = 0
            numsubs = 0
            for match in islice(self.finditer(string), count or None):
                parts.append(string[start : match.start()])
                parts.append(repl(match))
                start = match.end()
                numsubs += 1
            parts.append(string[start:])
            # NOTE(review): redundant — `empty` was already bound above.
            empty = type(string)()
            return empty.join(parts), numsubs
        else:
            # Iterate through matches to get index of last match
            repl = _typeguard_strings(repl)
            end = 0
            for match in islice(self.finditer(string), count or None):
                end = match.end()
            # Substitute globally within the prefix that contains the first
            # `count` matches, then append the untouched remainder.
            expanded, numsubs = self._suball(repl, string[:end])
            parts = [expanded, string[end:]]

            return empty.join(parts), numsubs

    def sub(self, repl, string, count=0):
        """
        Return the string obtained by replacing the leftmost non-overlapping occurrences of the
        pattern in `string` by the replacement `repl`.

        `repl` can be either a string or a callable. If it is a callable, it's passed the Match
        object and must return a replacement string to be used.
        """
        return self.subn(repl, string, count)[0]
+
+
+# ============================================================================
+# Match Object
+
+
class Match:
    """
    The result of a successful match, mirroring the `re.Match` interface.

    Instances are produced by `Pattern` matching methods, not constructed
    directly.
    """

    def __init__(self, pcre2_match_data, re, string, pos, endpos, byte_offset, options):
        # pcre2_match_data: match-data handle owned by the Cython layer.
        # re:               the Pattern that produced this match.
        # string:           the subject string that was searched.
        # pos / endpos:     the (clamped) region that was searched.
        # byte_offset / options: internal values forwarded to substitution.
        if not isinstance(pcre2_match_data, _cy.PCRE2MatchData):
            raise ValueError(
                "PCRE2 match data must be of type `_cy.PCRE2MatchData`. It is not recommended to "
                "instantiate `Match` objects directly. Instead, use `Pattern.match`."
            )
        self._pcre2_match_data = pcre2_match_data
        self.re = re
        self.string = string
        self.pos = pos
        self.endpos = endpos
        self._byte_offset = byte_offset
        self._options = options

    def __repr__(self):
        return (
            f"<{self.__class__.__module__}.{self.__class__.__qualname__} object; "
            f"span={self.span()}, match={repr(self.group())}>"
        )

    def _groupguard(self, group):
        # Validate a group reference (int index, str name, or index-like) and
        # return the corresponding group number.
        # NOTE(review): the IndexError messages below are inconsistently
        # capitalized ("No such group" vs "no such group") — confirm intent.
        if isinstance(group, int):
            if not 0 <= group <= self.re.groups:
                raise IndexError("No such group")
            group_number = group
        elif isinstance(group, str):
            if group not in self.re.groupindex:
                raise IndexError("no such group")
            group_number = self.re.groupindex[group]
        elif hasattr(group, "__index__"):
            # NOTE(review): this branch performs no range check, unlike the
            # int branch above — confirm whether that is intended.
            group_number = int(group.__index__())
        else:
            raise IndexError("No such group")
        return group_number

    def expand(self, template):
        """
        Return the string obtained by substitution on the template string `template`.
        """
        template = _typeguard_strings(template)
        # REPLACEMENT_ONLY returns just the expanded template rather than the
        # whole subject with the replacement spliced in.
        options = (
            self._options | _cy.SubstituteOption.REPLACEMENT_ONLY | _cy.SubstituteOption.UNSET_EMPTY
        )
        res, _ = _cy.substitute(
            self.re._pcre2_code,
            template,
            self.string,
            self._byte_offset,
            options=options,
            match_data=self._pcre2_match_data,
        )
        return res

    def span(self, group=0):
        """
        Return the start and end of `group` as the tuple `(start, end)`.

        If `group` did not contribute to the match, `(-1, -1)` is returned.
        """
        group_number = self._groupguard(group)
        return _cy.substring_span_bynumber(self._pcre2_match_data, self.string, group_number)

    def __getitem__(self, group):
        # `match[g]` returns the substring matched by group `g` (number or
        # name), delegating extraction to the Cython layer.
        group_number = self._groupguard(group)
        return _cy.substring_bynumber(self._pcre2_match_data, self.string, group_number)

    def group(self, *groups):
        """
        Returns one or more subgroups of the match.

        If there is a single argument, the result is a single string. If there are multiple
        arguments, the result is a tuple with one item per argument. Without arguments, the whole
        match is returned.
        """
        if not groups:
            groups = (0,)
        items = map(self.__getitem__, groups)
        return next(items) if len(groups) == 1 else tuple(items)

    def groups(self, default=None):
        """
        Return a tuple containing all the subgroups of the match.

        Groups that did not participate in the match are replaced by `default`.
        """
        items = []
        for group in range(1, self.re.groups + 1):
            item = self.__getitem__(group)
            items.append(default if item is None else item)
        return tuple(items)

    def groupdict(self, default=None):
        """
        Return a dictionary mapping subgroup names to the matched substrings, for all the named
        subgroups. Unmatched groups are mapped to `default`.
        """
        items = []
        for group, index in self.re.groupindex.items():
            item = self.__getitem__(index)
            items.append((group, default) if item is None else (group, item))
        return dict(items)

    def start(self, group=0):
        """
        Return the start index of the substring matched by `group`.
        """
        return self.span(group)[0]

    def end(self, group=0):
        """
        Return the end index of the substring matched by `group`.
        """
        return self.span(group)[1]

    # NOTE(review): `@property` over `@lru_cache(1)` (here and on `lastgroup`)
    # shares one cache slot across all instances and retains the last `self`;
    # functools.cached_property may be a better fit — confirm.
    @property
    @lru_cache(1)
    def lastindex(self):
        # Index of the last matched capturing group, or None if no group
        # participated in the match (unmatched groups report end() == -1).
        max_end = -1
        max_group = None
        # We look for the rightmost right parenthesis by keeping the first group that ends at
        # max_end because that is the leftmost/outermost group when there are nested groups!
        for group in range(1, self.re.groups + 1):
            end = self.end(group)
            if max_end < end:
                max_end = end
                max_group = group
        return max_group

    @property
    @lru_cache(1)
    def lastgroup(self):
        # Name of the group identified by `lastindex`, or None if that group
        # has no name (or nothing matched).
        max_group = self.lastindex
        if not max_group:
            return None
        for group, index in self.re.groupindex.items():
            if max_group == index:
                return group
        return None
--- /dev/null
+# -*- coding:utf-8 -*-
+# cython: profile=True
+
+from libc.stdint cimport uint8_t, uint32_t
+from libc.stdlib cimport malloc, free
+from libc.string cimport strlen
+from cpython.unicode cimport PyUnicode_Check, PyUnicode_AsUTF8AndSize
+from cpython.bytes cimport PyBytes_Check, PyBytes_AsStringAndSize
+
+from _libpcre2 cimport *
+
+from enum import IntFlag
+
+
+# Version of the underlying PCRE2 C library, e.g. "10.40"
+__libpcre2_version__ = f"{PCRE2_MAJOR}.{PCRE2_MINOR}"
+
+
+# ============================================================================
+# Pointer Proxies
+
+# Pointer wrappers to manage lifetime and expose to Python code
+cdef class PCRE2Code:
+    """ Owning wrapper around a compiled `pcre2_code_t *` so the pattern can be passed
+    through Python code and freed automatically when garbage collected. """
+    cdef pcre2_code_t *ptr
+    cdef bint _pattern_is_str
+
+    @staticmethod
+    cdef PCRE2Code from_ptr(pcre2_code_t *ptr, bint pattern_is_str):
+        """ Ownership of pointer is taken by the new instance """
+        cdef PCRE2Code instance = PCRE2Code.__new__(PCRE2Code)
+        instance.ptr = ptr
+        instance._pattern_is_str = pattern_is_str
+        return instance
+
+    def __init__(self, *args, **kwargs):
+        # Prevent accidental instantiation from normal Python code
+        raise TypeError(f"Cannot create 'PCRE2Code' instances")
+
+    def __dealloc__(self):
+        # Guard against partially constructed instances
+        if self.ptr is not NULL:
+            pcre2_code_free(self.ptr)
+
+
+cdef class PCRE2MatchData:
+    """ Owning wrapper around a `pcre2_match_data_t *` so match results can be passed
+    through Python code and freed automatically when garbage collected. """
+    cdef pcre2_match_data_t *ptr
+
+    @staticmethod
+    cdef PCRE2MatchData from_ptr(pcre2_match_data_t *ptr):
+        """ Ownership of pointer is always taken by the new instance """
+        cdef PCRE2MatchData instance = PCRE2MatchData.__new__(PCRE2MatchData)
+        instance.ptr = ptr
+        return instance
+
+    def __init__(self, *args, **kwargs):
+        # Prevent accidental instantiation from normal Python code
+        raise TypeError(f"Cannot create 'PCRE2MatchData' instances")
+
+    def __dealloc__(self):
+        # Guard against partially constructed instances
+        if self.ptr is not NULL:
+            pcre2_match_data_free(self.ptr)
+
+
+# ============================================================================
+# Buffer Acquisition
+
+cdef (uint8_t *, size_t) as_sptr_and_size(object obj) except *:
+    """ Return a borrowed (pointer, byte length) view of a 'str' (via its cached UTF-8
+    representation) or 'bytes' object; raises ValueError for any other type. """
+    cdef:
+        int rc
+        char *buffer = NULL
+        Py_ssize_t size = 0
+
+    if PyUnicode_Check(obj):
+        buffer = <char *>PyUnicode_AsUTF8AndSize(obj, &size)
+        assert(buffer is not NULL)  # The function is supposed to throw on errors
+    elif PyBytes_Check(obj):
+        rc = PyBytes_AsStringAndSize(obj, &buffer, &size)
+        assert(rc == 0)
+    else:
+        raise ValueError("Only objects of type 'str' and 'bytes' are supported")
+    return <uint8_t *>buffer, size
+
+
+# ============================================================================
+# Unicode Indexing
+
+cdef size_t idx_byte_to_char(
+    uint8_t *sptr, size_t byte_idx, size_t start_byte_idx = 0, size_t start_char_idx = 0
+):
+    """ Convert a UTF-8 byte offset into a character offset, optionally resuming a
+    previous scan from (start_byte_idx, start_char_idx). """
+    cdef:
+        size_t b = start_byte_idx
+        size_t c = start_char_idx
+
+    # Every byte that is not a UTF-8 continuation byte (0b10xxxxxx) starts a character
+    while b < byte_idx:
+        c += (sptr[b] & 0xC0) != 0x80
+        b += 1
+
+    return c
+
+
+cdef size_t idx_char_to_byte(
+    uint8_t *sptr, size_t sptr_size,
+    size_t char_idx,
+    size_t start_byte_idx = 0,
+    size_t start_char_idx = 0,
+):
+    """ Convert a character offset into a UTF-8 byte offset within a buffer of
+    sptr_size bytes, optionally resuming a previous scan from
+    (start_byte_idx, start_char_idx). Assumes char_idx does not exceed the number of
+    characters in the buffer. """
+    cdef:
+        size_t b = start_byte_idx
+        size_t c = start_char_idx
+
+    # Advance until char_idx characters have been counted (a character starts at every
+    # byte that is not a UTF-8 continuation byte, i.e. not 0b10xxxxxx) ...
+    while c < char_idx:
+        c += (sptr[b] & 0xC0) != 0x80
+        b += 1
+
+    # ... then consume any remaining continuation bytes of the final character
+    while b < sptr_size and (sptr[b] & 0xC0) == 0x80:
+        b += 1
+
+    return b
+
+
+# ============================================================================
+# Exceptions
+
+class LibraryError(Exception):
+    """
+    Error carrying a negative PCRE2 return code; the message text is obtained from
+    pcre2_get_error_message() and optionally prefixed with `ctxmsg`. Exposes `msg`
+    and `code` attributes.
+    """
+
+    def __init__(self, int errcode, object ctxmsg = None):
+        cdef:
+            uint8_t errmsg_sptr[120]
+            int rc
+
+        rc = pcre2_get_error_message(errcode, errmsg_sptr, sizeof(errmsg_sptr))
+        if rc == PCRE2_ERROR_NOMEMORY:
+            # Per the PCRE2 API, NOMEMORY means the message did not fit in the buffer
+            raise MemoryError
+        elif rc == PCRE2_ERROR_BADDATA:
+            raise ValueError(f"Unrecognized PCRE2 error code {errcode}")
+        elif rc < 0:
+            raise RuntimeError(f"Unhandled error code {rc} raised when getting error message")
+
+        # For non-negative values, return code is the length of the message
+        errmsg = errmsg_sptr[:rc].decode("UTF-8")
+        if ctxmsg:
+            errmsg = f"{ctxmsg}; {errmsg}"
+
+        super().__init__(errmsg)
+        self.msg = errmsg
+        self.code = errcode
+
+
+class PatternError(LibraryError):
+    """
+    Compilation failure, carrying the position within the pattern at which compilation
+    failed (exposed as the `pos` attribute).
+    """
+
+    def __init__(self, int errcode, errpos):
+        super().__init__(errcode, ctxmsg=f"compilation failed at position {errpos}")
+        self.pos = errpos
+
+
+cdef inline void raise_from_rc(int rc) except *:
+    """ Raise a LibraryError when `rc` is a negative PCRE2 return code. """
+    # `except *` is required here: a `void` cdef function without an exception
+    # declaration cannot propagate the raised exception to its caller (pre-3.0 Cython
+    # prints and swallows it instead), which would silently discard every error.
+    if rc < 0:
+        raise LibraryError(rc)
+
+
+# ============================================================================
+# Pattern Compilation
+
+
+class CompileOption(IntFlag):
+    """
+    Python-visible subset of the PCRE2 compile-time option bits (passed to compile()).
+    """
+
+    CASELESS = PCRE2_CASELESS
+    DOTALL = PCRE2_DOTALL
+    MULTILINE = PCRE2_MULTILINE
+    EXTENDED = PCRE2_EXTENDED
+
+    # Controls the input codec (whether the input bytes are read into characters by UTF-8
+    # decoding). If the input pattern is a `str`, the default behaviour is UNICODE (and this cannot
+    # be unset). If the input pattern is a `bytes`, the default is ASCII/Latin-1 (one byte per
+    # character), but UNICODE sets this to UTF-8.
+    UTF = PCRE2_UTF
+
+    # Controls the interpretation of character values. If characters are ASCII, then (for example)
+    # '\w' does not match values outside the range 0-127. If the input pattern is compiled with
+    # the `UTF` option (whether `str` or `bytes`), the default behaviour is `UCP` enabled; this can
+    # be disabled by the `ASCII` flag in the Python wrapper
+    UCP = PCRE2_UCP
+
+
+def compile(object pattern, uint32_t options = 0, disabled_options = 0):
+    """
+    Compile `pattern` ('str' or 'bytes') into a PCRE2Code object.
+
+    `options` are PCRE2 compile option bits to enable; any bits in `disabled_options`
+    are cleared after all defaults have been applied. Raises PatternError on failure.
+    """
+    cdef:
+        pcre2_code_t *code
+        uint8_t *patn_sptr
+        size_t patn_size
+        int rc
+        size_t errpos
+        bint is_str = PyUnicode_Check(pattern)
+
+    # Get views into object memory
+    patn_sptr, patn_size = as_sptr_and_size(pattern)
+
+    # Lock out the use of \C which can lead to patterns matching within characters, and
+    # set Python style '\uhhhh' syntax for literal unicode characters
+    options |= PCRE2_NEVER_BACKSLASH_C
+    options |= PCRE2_ALT_BSUX
+
+    # Default to UNICODE for 'str' patterns ('bytes' patterns must request it explicitly)
+    if is_str:
+        options |= PCRE2_UTF
+
+    # Always default to Unicode property support if we are interpreting strings as Unicode for
+    # both 'str' and 'bytes' objects
+    if options & PCRE2_UTF:
+        options |= PCRE2_UCP
+
+    # Allow for disabling any of the options set
+    options = options & ~disabled_options
+
+    code = pcre2_compile(patn_sptr, patn_size, options, &rc, &errpos, NULL)
+    if code is NULL:
+        if is_str:
+            # Report the error position in characters rather than UTF-8 bytes
+            errpos = idx_byte_to_char(patn_sptr, errpos)
+
+        # For some errors (e.g., unclosed groups) the whole pattern must be scanned and the error
+        # position returned is the length of the string. This means that the total range of error
+        # offset values is [0, length] inclusive
+        raise PatternError(rc, errpos)
+
+    return PCRE2Code.from_ptr(code, is_str)
+
+
+def jit_compile(PCRE2Code code not None):
+    """ JIT compile the pattern for complete matching; raises LibraryError on failure. """
+    cdef int rc = pcre2_jit_compile(code.ptr, PCRE2_JIT_COMPLETE)
+    raise_from_rc(rc)
+
+
+# ============================================================================
+# Information Extraction
+
+def pattern_is_utf(PCRE2Code code not None):
+    """ Whether the pattern was compiled with PCRE2_UTF set. """
+    cdef uint32_t all_options = 0
+    raise_from_rc(pcre2_pattern_info(code.ptr, PCRE2_INFO_ALLOPTIONS, &all_options))
+    return (all_options & PCRE2_UTF) != 0
+
+
+def pattern_capture_count(PCRE2Code code not None):
+    """ Number of capturing groups in the pattern. """
+    cdef uint32_t count = 0
+    raise_from_rc(pcre2_pattern_info(code.ptr, PCRE2_INFO_CAPTURECOUNT, &count))
+    return int(count)
+
+
+def pattern_name_dict(PCRE2Code code not None):
+    """
+    Return a dictionary mapping group names to group numbers for all named groups in
+    the pattern, decoded from the PCRE2 name table.
+    """
+    cdef:
+        const uint8_t *name_table
+        const uint8_t *entry
+        const uint8_t *name_sptr
+        uint32_t name_count, name_entry_size
+        int i
+
+    # Get name table related information
+    raise_from_rc(pcre2_pattern_info(code.ptr, PCRE2_INFO_NAMECOUNT, &name_count))
+    raise_from_rc(pcre2_pattern_info(code.ptr, PCRE2_INFO_NAMEENTRYSIZE, &name_entry_size))
+    raise_from_rc(pcre2_pattern_info(code.ptr, PCRE2_INFO_NAMETABLE, &name_table))
+
+    encoding = "UTF-8" if pattern_is_utf(code) else "Latin-1"
+
+    # Each fixed-size table entry holds the group number in its first two bytes
+    # (big-endian) followed by the zero-terminated group name. Names can be assumed to
+    # be in Latin-1 for non-unicode patterns; default builds of PCRE2 only allow ASCII
+    # character names.
+    name_dict = {}
+    for i in range(name_count):
+        entry = name_table + i * name_entry_size
+        name_sptr = entry + 2
+        group_name = name_sptr[:strlen(<const char *>name_sptr)].decode(encoding)
+        name_dict[group_name] = int((entry[0] << 8) | entry[1])
+
+    return name_dict
+
+
+def substring_span_bynumber(PCRE2MatchData match_data not None, object subject, size_t number):
+    """
+    Return the (start, end) span of capture group `number` within `subject` — in
+    character indices for 'str' subjects and byte indices for 'bytes' subjects — or
+    (-1, -1) when the group did not contribute to the match.
+    """
+    cdef:
+        size_t *ovector
+        uint8_t *subj_sptr
+        size_t subj_size
+        int rc
+        size_t start
+        size_t end
+
+    # Get views into object memory
+    subj_sptr, subj_size = as_sptr_and_size(subject)
+
+    # Only perform offset lookup if group has been set. Unlike the previous silent
+    # fallthrough, genuine lookup errors are raised instead of being reported as an
+    # unset group — consistent with substring_bynumber() below.
+    rc = pcre2_substring_length_bynumber(match_data.ptr, number, NULL)
+    if rc == PCRE2_ERROR_UNSET:
+        return (-1, -1)
+    raise_from_rc(rc)
+
+    ovector = pcre2_get_ovector_pointer(match_data.ptr)
+    start = ovector[2 * number]
+    end = ovector[2 * number + 1]
+
+    if PyUnicode_Check(subject):
+        # Translate UTF-8 byte offsets into character offsets
+        start = idx_byte_to_char(subj_sptr, start)
+        end = idx_byte_to_char(subj_sptr, end)
+
+    return (start, end)
+
+
+def substring_bynumber(PCRE2MatchData match_data not None, object subject, size_t number):
+    """
+    Return the substring matched by capture group `number` (as the same type as
+    `subject`), or None when the group did not contribute to the match.
+    """
+    cdef:
+        size_t *ovector
+        uint8_t *subj_sptr
+        size_t subj_size
+        int rc
+        size_t start
+        size_t end
+
+    # Get views into object memory
+    subj_sptr, subj_size = as_sptr_and_size(subject)
+
+    # Only perform offset lookup if group has been set
+    rc = pcre2_substring_length_bynumber(match_data.ptr, number, NULL)
+    if rc == PCRE2_ERROR_UNSET:
+        return None
+    raise_from_rc(rc)
+
+    ovector = pcre2_get_ovector_pointer(match_data.ptr)
+    start = ovector[2 * number]
+    end = ovector[2 * number + 1]
+
+    if PyUnicode_Check(subject):
+        # Match the type of the return object to the input object
+        return bytes(subj_sptr[start:end]).decode("UTF-8")
+    return bytes(subj_sptr[start:end])
+
+
+# ============================================================================
+# Matching
+
+class MatchOption(IntFlag):
+    """
+    Python-visible subset of the PCRE2 match-time option bits (passed to match()).
+    """
+
+    ANCHORED = PCRE2_ANCHORED
+    ENDANCHORED = PCRE2_ENDANCHORED
+
+# Thin pass-through over the C function of the same name.
+# NOTE(review): presumably kept as an indirection/profiling seam — confirm before removing.
+cdef pcre2_match_data_t * _pcre2_match_data_create_from_pattern(
+    const pcre2_code_t *code, pcre2_general_context_t *gcontext
+):
+    return pcre2_match_data_create_from_pattern(code, gcontext)
+
+# Thin pass-through over pcre2_match().
+# NOTE(review): presumably kept as an indirection/profiling seam — confirm before removing.
+cdef int _pcre2_match(
+    const pcre2_code_t *code,
+    pcre2_sptr_t subject,
+    size_t length,
+    size_t startoffset,
+    uint32_t options,
+    pcre2_match_data_t *match_data,
+    pcre2_match_context_t *mcontext
+):
+    return pcre2_match(code, subject, length, startoffset, options, match_data, mcontext)
+
+cdef PCRE2MatchData _match(
+    PCRE2Code code,
+    uint8_t *subj_sptr,
+    size_t byte_length,
+    size_t byte_offset,
+    uint32_t options,
+) except *:
+    """
+    Run pcre2_match() over subj_sptr[:byte_length] starting at byte_offset; return an
+    owning PCRE2MatchData on success, None when nothing matches, and raise
+    LibraryError on any other error.
+    """
+    cdef:
+        pcre2_match_data_t *match_data_ptr
+        int rc
+
+    # Allocate memory for match data, returning NULL if the memory could not be obtained
+    match_data_ptr = _pcre2_match_data_create_from_pattern(code.ptr, NULL)
+    if match_data_ptr is NULL:
+        raise MemoryError
+
+    # Attempt match of pattern onto the subject. Ownership of the match data block is
+    # only handed over to a PCRE2MatchData on success; on the no-match and error paths
+    # it must be freed here or it would leak (as the previous version did).
+    rc = _pcre2_match(code.ptr, subj_sptr, byte_length, byte_offset, options, match_data_ptr, NULL)
+    if rc == PCRE2_ERROR_NOMATCH:
+        pcre2_match_data_free(match_data_ptr)
+        return None
+    if rc < 0:
+        pcre2_match_data_free(match_data_ptr)
+        raise LibraryError(rc)
+
+    return PCRE2MatchData.from_ptr(match_data_ptr)
+
+def match(
+    PCRE2Code code not None,
+    object subject,
+    size_t length, # length & offset in logical (index) units
+    size_t offset,
+    uint32_t options = 0,
+):
+    """
+    Match the pattern once against `subject`, returning a tuple of
+    (PCRE2MatchData or None, byte offset the match started from, options used).
+    `length` and `offset` are character indices for 'str' subjects and byte indices
+    for 'bytes' subjects.
+    """
+    cdef:
+        uint8_t *subj_sptr
+        size_t subj_size
+        size_t byte_length
+        size_t byte_offset
+
+    # Although the error message says "cannot use..." there would actually be nothing wrong at all
+    # with removing this block and allowing it. It's simply a matter of policy and clarity, and to
+    # match Python's re module.
+    if code._pattern_is_str ^ PyUnicode_Check(subject):
+        if code._pattern_is_str:
+            raise TypeError("Cannot use a string pattern on a bytes-like object")
+        else:
+            raise TypeError("Cannot use a bytes pattern on a string-like object")
+
+    # Get views into object memory
+    subj_sptr, subj_size = as_sptr_and_size(subject)
+
+    byte_length = length
+    byte_offset = offset
+    if PyUnicode_Check(subject):
+        # Disable UTF-8 encoding checks for improved performance
+        options |= PCRE2_NO_UTF_CHECK
+
+        # Translate character indices into byte offsets; a value equal to len(subject)
+        # short-circuits to the full byte size of the UTF-8 buffer
+        if length == len(subject):
+            byte_length = subj_size
+        else:
+            byte_length = idx_char_to_byte(subj_sptr, subj_size, length)
+        if offset == len(subject):
+            byte_offset = subj_size
+        else:
+            byte_offset = idx_char_to_byte(subj_sptr, subj_size, offset)
+
+    return _match(code, subj_sptr, byte_length, byte_offset, options), byte_offset, options
+
+
+def match_generator(
+    PCRE2Code code not None,
+    object subject,
+    size_t length, # length & offset in logical (index) units
+    size_t offset,
+):
+    """
+    Yield (PCRE2MatchData, byte offset matched from, options used) for successive
+    non-overlapping matches over `subject`, handling empty matches the way Python's
+    re module does.
+    """
+    cdef:
+        uint32_t starting_options = 0
+        uint32_t state_options = 0
+        uint32_t match_options
+        size_t byte_length = length
+        size_t byte_offset = offset
+        size_t match_byte_offset
+
+    # Although the error message says "cannot use..." there would actually be nothing wrong at all
+    # with removing this block and allowing it. It's simply a matter of policy and clarity, and to
+    # match Python's re module.
+    if code._pattern_is_str ^ PyUnicode_Check(subject):
+        if code._pattern_is_str:
+            raise TypeError("Cannot use a string pattern on a bytes-like object")
+        else:
+            raise TypeError("Cannot use a bytes pattern on a string-like object")
+
+    # Get views into object memory
+    subj_sptr, subj_size = as_sptr_and_size(subject)
+
+    if PyUnicode_Check(subject):
+        # Disable UTF-8 encoding checks for improved performance
+        starting_options |= PCRE2_NO_UTF_CHECK
+
+        byte_length = (
+            subj_size if length == len(subject) else idx_char_to_byte(subj_sptr, subj_size, length)
+        )
+        byte_offset = (
+            subj_size if offset == len(subject) else idx_char_to_byte(subj_sptr, subj_size, offset)
+        )
+
+    while byte_offset <= byte_length:
+        match_options = starting_options | state_options
+        match_byte_offset = byte_offset
+        match_data = _match(code, subj_sptr, byte_length, match_byte_offset, match_options)
+        if match_data is None:
+            if state_options == 0:
+                # Nothing (further) matches; the iteration is complete
+                break
+
+            # A non-empty match could not be found at the position of the previous empty
+            # match. The previous version stopped here, dropping any later matches (e.g.
+            # 'a*' over 'ba' ended after the empty match at offset 0). Instead, advance
+            # by one character and retry, as in PCRE2's pcre2demo.c.
+            state_options = 0
+            byte_offset += 1
+            if PyUnicode_Check(subject):
+                # Skip over any UTF-8 continuation bytes of the advanced-past character
+                while byte_offset < byte_length and (subj_sptr[byte_offset] & 0xC0) == 0x80:
+                    byte_offset += 1
+            continue
+
+        ovector = pcre2_get_ovector_pointer(match_data.ptr)
+
+        assert(match_byte_offset <= ovector[0] and ovector[0] <= ovector[1])
+        assert(ovector[1] > match_byte_offset or state_options == 0)
+
+        if ovector[0] == ovector[1]:
+            # If the matched string is empty ensure the next match makes progress
+            state_options = PCRE2_NOTEMPTY_ATSTART
+        else:
+            state_options = 0 # Reset options so empty strings can match at next offset
+
+        byte_offset = ovector[1]
+
+        yield match_data, match_byte_offset, match_options
+
+        # No need to re-match after an empty match at the end (it will just find nothing)
+        if ovector[0] == ovector[1] and ovector[1] >= byte_length:
+            break
+
+
+# ============================================================================
+# Substitution
+
+
+class SubstituteOption(IntFlag):
+    """
+    Python-visible subset of the PCRE2 substitution option bits (passed to substitute()).
+    """
+
+    GLOBAL = PCRE2_SUBSTITUTE_GLOBAL
+    UNSET_EMPTY = PCRE2_SUBSTITUTE_UNSET_EMPTY
+    REPLACEMENT_ONLY = PCRE2_SUBSTITUTE_REPLACEMENT_ONLY
+
+def substitute(
+    PCRE2Code code not None,
+    object replacement,
+    object subject,
+    size_t byte_offset, # in bytes - unlike _cy.match()
+    uint32_t options = 0,
+    PCRE2MatchData match_data = None,
+):
+    """
+    Substitute matches of the pattern in `subject` with `replacement`, returning a
+    tuple of (result, number of substitutions made). The result has the same type as
+    `subject`. When `match_data` is given, the already-made match is substituted
+    (PCRE2_SUBSTITUTE_MATCHED) instead of re-matching.
+    """
+    cdef:
+        int rc
+        pcre2_match_data_t *match_data_ptr = NULL
+        uint8_t *subj_sptr
+        uint8_t *repl_sptr
+        uint8_t *res_sptr
+        size_t subj_size, repl_size, res_size
+
+    # Always compute the needed length if there is any overflow
+    options |= PCRE2_SUBSTITUTE_OVERFLOW_LENGTH
+
+    # Add support for backslash escape characters and Python substitution forms
+    options |= PCRE2_SUBSTITUTE_EXTENDED
+
+    # Although the error message says "cannot use..." there would actually be nothing wrong at all
+    # with removing this block and allowing it. It's simply a matter of policy and clarity, and to
+    # match Python's re module.
+    if code._pattern_is_str ^ PyUnicode_Check(subject):
+        if code._pattern_is_str:
+            raise TypeError("Cannot use a string pattern on a bytes-like object")
+        else:
+            raise TypeError("Cannot use a bytes pattern on a string-like object")
+
+    # Similarly, ensure that there is a match between the type of subject and replacement.
+    #
+    # Unlike the check that pattern and subject match, this one cannot be simply removed. We
+    # pass in the PCRE2_NO_UTF_CHECK flag based on the type of subject, and that flag also affects
+    # the interpretation of replacement. So, we require a check that the replacement string is
+    # valid UTF-8, if the subject is a 'str' object (note that we could do this either by enforcing
+    # that replacement is a 'str', or we could allow bytes as well if we do the decode here to
+    # validate it).
+    #
+    # For policy and clarity, we additionally forbid using a 'str' replacement with a 'bytes'
+    # subject, although there is no issue with that combination.
+    if PyUnicode_Check(subject) ^ PyUnicode_Check(replacement):
+        if PyUnicode_Check(subject):
+            raise TypeError("Cannot use a string subject with a bytes-like template")
+        else:
+            raise TypeError("Cannot use a bytes subject with a string-like template")
+
+    # Get views into object memory
+    repl_sptr, repl_size = as_sptr_and_size(replacement)
+    subj_sptr, subj_size = as_sptr_and_size(subject)
+
+    # Disable UTF-8 encoding checks for improved performance
+    if match_data is None and PyUnicode_Check(subject):
+        options |= PCRE2_NO_UTF_CHECK
+
+    if match_data is not None:
+        match_data_ptr = match_data.ptr
+        options |= PCRE2_SUBSTITUTE_MATCHED
+
+    # Make simple attempt at guess for required memory, unless match has already been made
+    res_size = subj_size + (subj_size // 2) if match_data is None else 0
+    res_sptr = <uint8_t *>malloc(res_size * sizeof(uint8_t))
+    if res_sptr is NULL and res_size > 0:
+        # malloc(0) may legitimately return NULL; only a failed non-empty request
+        # indicates memory exhaustion (previously unchecked)
+        raise MemoryError
+    try:
+        rc = pcre2_substitute(
+            code.ptr,
+            subj_sptr, subj_size,
+            byte_offset,
+            options,
+            match_data_ptr,
+            NULL,
+            repl_sptr, repl_size,
+            res_sptr, &res_size,
+        )
+        # Reattempt substitution if no memory, now with required size of buffer known
+        if rc == PCRE2_ERROR_NOMEMORY:
+            free(res_sptr)
+            res_sptr = <uint8_t *>malloc(res_size * sizeof(uint8_t))
+            if res_sptr is NULL:
+                # The required size is never zero here, so NULL is a genuine failure
+                raise MemoryError
+            rc = pcre2_substitute(
+                code.ptr,
+                subj_sptr, subj_size,
+                byte_offset,
+                options,
+                match_data_ptr,
+                NULL,
+                repl_sptr, repl_size,
+                res_sptr, &res_size,
+            )
+        raise_from_rc(rc)
+
+        # Non-error return code contains the number of substitutions made
+        res_obj = bytes(res_sptr[:res_size])
+        if PyUnicode_Check(subject):
+            # Match the type of the return object to the input object
+            res_obj = res_obj.decode("UTF-8")
+        return (res_obj, rc)
+
+    finally:
+        free(res_sptr)
--- /dev/null
+# -*- coding:utf-8 -*-
+
+from libc.stdint cimport uint8_t, uint32_t, int32_t
+
+
+cdef extern from "pcre2.h":
+ cdef unsigned int PCRE2_MAJOR
+ cdef unsigned int PCRE2_MINOR
+
+ # The following option bits can be passed to pcre2_compile(),
+ # pcre2_match(), or pcre2_dfa_match(). PCRE2_NO_UTF_CHECK affects only the
+ # function to which it is passed. Put these bits at the most significant
+ # end of the options word so others can be added next to them.
+ cdef unsigned int PCRE2_ANCHORED
+ cdef unsigned int PCRE2_NO_UTF_CHECK
+ cdef unsigned int PCRE2_ENDANCHORED
+
+ # The following option bits can be passed only to pcre2_compile(). However,
+ # they may affect compilation, JIT compilation, and/or interpretive
+ # execution. The following tags indicate which:
+ # C alters what is compiled by pcre2_compile()
+ # J alters what is compiled by pcre2_jit_compile()
+ # M is inspected during pcre2_match() execution
+ # D is inspected during pcre2_dfa_match() execution
+ cdef unsigned int PCRE2_ALLOW_EMPTY_CLASS # C
+ cdef unsigned int PCRE2_ALT_BSUX # C
+ cdef unsigned int PCRE2_AUTO_CALLOUT # C
+ cdef unsigned int PCRE2_CASELESS # C
+ cdef unsigned int PCRE2_DOLLAR_ENDONLY # J M D
+ cdef unsigned int PCRE2_DOTALL # C
+ cdef unsigned int PCRE2_DUPNAMES # C
+ cdef unsigned int PCRE2_EXTENDED # C
+ cdef unsigned int PCRE2_FIRSTLINE # J M D
+ cdef unsigned int PCRE2_MATCH_UNSET_BACKREF # C J M
+ cdef unsigned int PCRE2_MULTILINE # C
+ cdef unsigned int PCRE2_NEVER_UCP # C
+ cdef unsigned int PCRE2_NEVER_UTF # C
+ cdef unsigned int PCRE2_NO_AUTO_CAPTURE # C
+ cdef unsigned int PCRE2_NO_AUTO_POSSESS # C
+ cdef unsigned int PCRE2_NO_DOTSTAR_ANCHOR # C
+ cdef unsigned int PCRE2_NO_START_OPTIMIZE # J M D
+ cdef unsigned int PCRE2_UCP # C J M D
+ cdef unsigned int PCRE2_UNGREEDY # C
+ cdef unsigned int PCRE2_UTF # C J M D
+ cdef unsigned int PCRE2_NEVER_BACKSLASH_C # C
+ cdef unsigned int PCRE2_ALT_CIRCUMFLEX # J M D
+ cdef unsigned int PCRE2_ALT_VERBNAMES # C
+ cdef unsigned int PCRE2_USE_OFFSET_LIMIT # J M D
+ cdef unsigned int PCRE2_EXTENDED_MORE # C
+ cdef unsigned int PCRE2_LITERAL # C
+ cdef unsigned int PCRE2_MATCH_INVALID_UTF # J M D
+
+ # An additional compile options word is available in the compile context.
+ cdef unsigned int PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES # C
+ cdef unsigned int PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL # C
+ cdef unsigned int PCRE2_EXTRA_MATCH_WORD # C
+ cdef unsigned int PCRE2_EXTRA_MATCH_LINE # C
+ cdef unsigned int PCRE2_EXTRA_ESCAPED_CR_IS_LF # C
+ cdef unsigned int PCRE2_EXTRA_ALT_BSUX # C
+ cdef unsigned int PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK # C
+
+ # These are for pcre2_jit_compile().
+ cdef unsigned int PCRE2_JIT_COMPLETE # For full matching.
+ cdef unsigned int PCRE2_JIT_PARTIAL_SOFT
+ cdef unsigned int PCRE2_JIT_PARTIAL_HARD
+ cdef unsigned int PCRE2_JIT_INVALID_UTF
+
+ # These are for pcre2_match(), pcre2_dfa_match(), pcre2_jit_match(), and
+ # pcre2_substitute(). Some are allowed only for one of the functions, and
+ # in these cases it is noted below. Note that PCRE2_ANCHORED,
+ # PCRE2_ENDANCHORED and PCRE2_NO_UTF_CHECK can also be passed to these
+ # functions (though pcre2_jit_match() ignores the latter since it bypasses
+ # all sanity checks).
+ cdef unsigned int PCRE2_NOTBOL
+ cdef unsigned int PCRE2_NOTEOL
+ cdef unsigned int PCRE2_NOTEMPTY # ) These two must be kept
+ cdef unsigned int PCRE2_NOTEMPTY_ATSTART # ) adjacent to each other.
+ cdef unsigned int PCRE2_PARTIAL_SOFT
+ cdef unsigned int PCRE2_PARTIAL_HARD
+ cdef unsigned int PCRE2_DFA_RESTART # pcre2_dfa_match() only
+ cdef unsigned int PCRE2_DFA_SHORTEST # pcre2_dfa_match() only
+ cdef unsigned int PCRE2_SUBSTITUTE_GLOBAL # pcre2_substitute() only
+ cdef unsigned int PCRE2_SUBSTITUTE_EXTENDED # pcre2_substitute() only
+ cdef unsigned int PCRE2_SUBSTITUTE_UNSET_EMPTY # pcre2_substitute() only
+ cdef unsigned int PCRE2_SUBSTITUTE_UNKNOWN_UNSET # pcre2_substitute() only
+ cdef unsigned int PCRE2_SUBSTITUTE_OVERFLOW_LENGTH # pcre2_substitute() only
+ cdef unsigned int PCRE2_NO_JIT # Not for pcre2_dfa_match()
+ cdef unsigned int PCRE2_COPY_MATCHED_SUBJECT
+ cdef unsigned int PCRE2_SUBSTITUTE_LITERAL # pcre2_substitute() only
+ cdef unsigned int PCRE2_SUBSTITUTE_MATCHED # pcre2_substitute() only
+ cdef unsigned int PCRE2_SUBSTITUTE_REPLACEMENT_ONLY # pcre2_substitute() only
+
+ # Options for pcre2_pattern_convert().
+ cdef unsigned int PCRE2_CONVERT_UTF
+ cdef unsigned int PCRE2_CONVERT_NO_UTF_CHECK
+ cdef unsigned int PCRE2_CONVERT_POSIX_BASIC
+ cdef unsigned int PCRE2_CONVERT_POSIX_EXTENDED
+ cdef unsigned int PCRE2_CONVERT_GLOB
+ cdef unsigned int PCRE2_CONVERT_GLOB_NO_WILD_SEPARATOR
+ cdef unsigned int PCRE2_CONVERT_GLOB_NO_STARSTAR
+
+ # Newline and \R settings, for use in compile contexts. The newline values
+ # must be kept in step with values set in config.h and both sets must all
+ # be greater than zero.
+ cdef int PCRE2_NEWLINE_CR
+ cdef int PCRE2_NEWLINE_LF
+ cdef int PCRE2_NEWLINE_CRLF
+ cdef int PCRE2_NEWLINE_ANY
+ cdef int PCRE2_NEWLINE_ANYCRLF
+ cdef int PCRE2_NEWLINE_NUL
+
+ cdef int PCRE2_BSR_UNICODE
+ cdef int PCRE2_BSR_ANYCRLF
+
+ # Error codes for pcre2_compile(). Some of these are also used by
+ # pcre2_pattern_convert().
+ cdef int PCRE2_ERROR_END_BACKSLASH
+ cdef int PCRE2_ERROR_END_BACKSLASH_C
+ cdef int PCRE2_ERROR_UNKNOWN_ESCAPE
+ cdef int PCRE2_ERROR_QUANTIFIER_OUT_OF_ORDER
+ cdef int PCRE2_ERROR_QUANTIFIER_TOO_BIG
+ cdef int PCRE2_ERROR_MISSING_SQUARE_BRACKET
+ cdef int PCRE2_ERROR_ESCAPE_INVALID_IN_CLASS
+ cdef int PCRE2_ERROR_CLASS_RANGE_ORDER
+ cdef int PCRE2_ERROR_QUANTIFIER_INVALID
+ cdef int PCRE2_ERROR_INTERNAL_UNEXPECTED_REPEAT
+ cdef int PCRE2_ERROR_INVALID_AFTER_PARENS_QUERY
+ cdef int PCRE2_ERROR_POSIX_CLASS_NOT_IN_CLASS
+ cdef int PCRE2_ERROR_POSIX_NO_SUPPORT_COLLATING
+ cdef int PCRE2_ERROR_MISSING_CLOSING_PARENTHESIS
+ cdef int PCRE2_ERROR_BAD_SUBPATTERN_REFERENCE
+ cdef int PCRE2_ERROR_NULL_PATTERN
+ cdef int PCRE2_ERROR_BAD_OPTIONS
+ cdef int PCRE2_ERROR_MISSING_COMMENT_CLOSING
+ cdef int PCRE2_ERROR_PARENTHESES_NEST_TOO_DEEP
+ cdef int PCRE2_ERROR_PATTERN_TOO_LARGE
+ cdef int PCRE2_ERROR_HEAP_FAILED
+ cdef int PCRE2_ERROR_UNMATCHED_CLOSING_PARENTHESIS
+ cdef int PCRE2_ERROR_INTERNAL_CODE_OVERFLOW
+ cdef int PCRE2_ERROR_MISSING_CONDITION_CLOSING
+ cdef int PCRE2_ERROR_LOOKBEHIND_NOT_FIXED_LENGTH
+ cdef int PCRE2_ERROR_ZERO_RELATIVE_REFERENCE
+ cdef int PCRE2_ERROR_TOO_MANY_CONDITION_BRANCHES
+ cdef int PCRE2_ERROR_CONDITION_ASSERTION_EXPECTED
+ cdef int PCRE2_ERROR_BAD_RELATIVE_REFERENCE
+ cdef int PCRE2_ERROR_UNKNOWN_POSIX_CLASS
+ cdef int PCRE2_ERROR_INTERNAL_STUDY_ERROR
+ cdef int PCRE2_ERROR_UNICODE_NOT_SUPPORTED
+ cdef int PCRE2_ERROR_PARENTHESES_STACK_CHECK
+ cdef int PCRE2_ERROR_CODE_POINT_TOO_BIG
+ cdef int PCRE2_ERROR_LOOKBEHIND_TOO_COMPLICATED
+ cdef int PCRE2_ERROR_LOOKBEHIND_INVALID_BACKSLASH_C
+ cdef int PCRE2_ERROR_UNSUPPORTED_ESCAPE_SEQUENCE
+ cdef int PCRE2_ERROR_CALLOUT_NUMBER_TOO_BIG
+ cdef int PCRE2_ERROR_MISSING_CALLOUT_CLOSING
+ cdef int PCRE2_ERROR_ESCAPE_INVALID_IN_VERB
+ cdef int PCRE2_ERROR_UNRECOGNIZED_AFTER_QUERY_P
+ cdef int PCRE2_ERROR_MISSING_NAME_TERMINATOR
+ cdef int PCRE2_ERROR_DUPLICATE_SUBPATTERN_NAME
+ cdef int PCRE2_ERROR_INVALID_SUBPATTERN_NAME
+ cdef int PCRE2_ERROR_UNICODE_PROPERTIES_UNAVAILABLE
+ cdef int PCRE2_ERROR_MALFORMED_UNICODE_PROPERTY
+ cdef int PCRE2_ERROR_UNKNOWN_UNICODE_PROPERTY
+ cdef int PCRE2_ERROR_SUBPATTERN_NAME_TOO_LONG
+ cdef int PCRE2_ERROR_TOO_MANY_NAMED_SUBPATTERNS
+ cdef int PCRE2_ERROR_CLASS_INVALID_RANGE
+ cdef int PCRE2_ERROR_OCTAL_BYTE_TOO_BIG
+ cdef int PCRE2_ERROR_INTERNAL_OVERRAN_WORKSPACE
+ cdef int PCRE2_ERROR_INTERNAL_MISSING_SUBPATTERN
+ cdef int PCRE2_ERROR_DEFINE_TOO_MANY_BRANCHES
+ cdef int PCRE2_ERROR_BACKSLASH_O_MISSING_BRACE
+ cdef int PCRE2_ERROR_INTERNAL_UNKNOWN_NEWLINE
+ cdef int PCRE2_ERROR_BACKSLASH_G_SYNTAX
+ cdef int PCRE2_ERROR_PARENS_QUERY_R_MISSING_CLOSING
+ # Error 159 is obsolete and should now never occur
+ cdef int PCRE2_ERROR_VERB_ARGUMENT_NOT_ALLOWED
+ cdef int PCRE2_ERROR_VERB_UNKNOWN
+ cdef int PCRE2_ERROR_SUBPATTERN_NUMBER_TOO_BIG
+ cdef int PCRE2_ERROR_SUBPATTERN_NAME_EXPECTED
+ cdef int PCRE2_ERROR_INTERNAL_PARSED_OVERFLOW
+ cdef int PCRE2_ERROR_INVALID_OCTAL
+ cdef int PCRE2_ERROR_SUBPATTERN_NAMES_MISMATCH
+ cdef int PCRE2_ERROR_MARK_MISSING_ARGUMENT
+ cdef int PCRE2_ERROR_INVALID_HEXADECIMAL
+ cdef int PCRE2_ERROR_BACKSLASH_C_SYNTAX
+ cdef int PCRE2_ERROR_BACKSLASH_K_SYNTAX
+ cdef int PCRE2_ERROR_INTERNAL_BAD_CODE_LOOKBEHINDS
+ cdef int PCRE2_ERROR_BACKSLASH_N_IN_CLASS
+ cdef int PCRE2_ERROR_CALLOUT_STRING_TOO_LONG
+ cdef int PCRE2_ERROR_UNICODE_DISALLOWED_CODE_POINT
+ cdef int PCRE2_ERROR_UTF_IS_DISABLED
+ cdef int PCRE2_ERROR_UCP_IS_DISABLED
+ cdef int PCRE2_ERROR_VERB_NAME_TOO_LONG
+ cdef int PCRE2_ERROR_BACKSLASH_U_CODE_POINT_TOO_BIG
+ cdef int PCRE2_ERROR_MISSING_OCTAL_OR_HEX_DIGITS
+ cdef int PCRE2_ERROR_VERSION_CONDITION_SYNTAX
+ cdef int PCRE2_ERROR_INTERNAL_BAD_CODE_AUTO_POSSESS
+ cdef int PCRE2_ERROR_CALLOUT_NO_STRING_DELIMITER
+ cdef int PCRE2_ERROR_CALLOUT_BAD_STRING_DELIMITER
+ cdef int PCRE2_ERROR_BACKSLASH_C_CALLER_DISABLED
+ cdef int PCRE2_ERROR_QUERY_BARJX_NEST_TOO_DEEP
+ cdef int PCRE2_ERROR_BACKSLASH_C_LIBRARY_DISABLED
+ cdef int PCRE2_ERROR_PATTERN_TOO_COMPLICATED
+ cdef int PCRE2_ERROR_LOOKBEHIND_TOO_LONG
+ cdef int PCRE2_ERROR_PATTERN_STRING_TOO_LONG
+ cdef int PCRE2_ERROR_INTERNAL_BAD_CODE
+ cdef int PCRE2_ERROR_INTERNAL_BAD_CODE_IN_SKIP
+ cdef int PCRE2_ERROR_NO_SURROGATES_IN_UTF16
+ cdef int PCRE2_ERROR_BAD_LITERAL_OPTIONS
+ cdef int PCRE2_ERROR_SUPPORTED_ONLY_IN_UNICODE
+ cdef int PCRE2_ERROR_INVALID_HYPHEN_IN_OPTIONS
+ cdef int PCRE2_ERROR_ALPHA_ASSERTION_UNKNOWN
+ cdef int PCRE2_ERROR_SCRIPT_RUN_NOT_AVAILABLE
+ cdef int PCRE2_ERROR_TOO_MANY_CAPTURES
+ cdef int PCRE2_ERROR_CONDITION_ATOMIC_ASSERTION_EXPECTED
+ cdef int PCRE2_ERROR_BACKSLASH_K_IN_LOOKAROUND
+
+ # "Expected" matching error codes: no match and partial match.
+ cdef int PCRE2_ERROR_NOMATCH
+ cdef int PCRE2_ERROR_PARTIAL
+
+ # Error codes for UTF-8 validity checks.
+ cdef int PCRE2_ERROR_UTF8_ERR1
+ cdef int PCRE2_ERROR_UTF8_ERR2
+ cdef int PCRE2_ERROR_UTF8_ERR3
+ cdef int PCRE2_ERROR_UTF8_ERR4
+ cdef int PCRE2_ERROR_UTF8_ERR5
+ cdef int PCRE2_ERROR_UTF8_ERR6
+ cdef int PCRE2_ERROR_UTF8_ERR7
+ cdef int PCRE2_ERROR_UTF8_ERR8
+ cdef int PCRE2_ERROR_UTF8_ERR9
+ cdef int PCRE2_ERROR_UTF8_ERR10
+ cdef int PCRE2_ERROR_UTF8_ERR11
+ cdef int PCRE2_ERROR_UTF8_ERR12
+ cdef int PCRE2_ERROR_UTF8_ERR13
+ cdef int PCRE2_ERROR_UTF8_ERR14
+ cdef int PCRE2_ERROR_UTF8_ERR15
+ cdef int PCRE2_ERROR_UTF8_ERR16
+ cdef int PCRE2_ERROR_UTF8_ERR17
+ cdef int PCRE2_ERROR_UTF8_ERR18
+ cdef int PCRE2_ERROR_UTF8_ERR19
+ cdef int PCRE2_ERROR_UTF8_ERR20
+ cdef int PCRE2_ERROR_UTF8_ERR21
+
+ # Error codes for UTF-16 validity checks.
+ cdef int PCRE2_ERROR_UTF16_ERR1
+ cdef int PCRE2_ERROR_UTF16_ERR2
+ cdef int PCRE2_ERROR_UTF16_ERR3
+
+ # Error codes for UTF-32 validity checks.
+ cdef int PCRE2_ERROR_UTF32_ERR1
+ cdef int PCRE2_ERROR_UTF32_ERR2
+
+ # Miscellaneous error codes for pcre2[_dfa]_match(), substring extraction
+ # functions, context functions, and serializing functions. They are in
+ # numerical order. Originally they were in alphabetical order too, but now
+ # that PCRE2 is released, the numbers must not be changed.
+ cdef int PCRE2_ERROR_BADDATA
+ cdef int PCRE2_ERROR_MIXEDTABLES # Name was changed.
+ cdef int PCRE2_ERROR_BADMAGIC
+ cdef int PCRE2_ERROR_BADMODE
+ cdef int PCRE2_ERROR_BADOFFSET
+ cdef int PCRE2_ERROR_BADOPTION
+ cdef int PCRE2_ERROR_BADREPLACEMENT
+ cdef int PCRE2_ERROR_BADUTFOFFSET
+ cdef int PCRE2_ERROR_CALLOUT # Never used by PCRE2 itself.
+ cdef int PCRE2_ERROR_DFA_BADRESTART
+ cdef int PCRE2_ERROR_DFA_RECURSE
+ cdef int PCRE2_ERROR_DFA_UCOND
+ cdef int PCRE2_ERROR_DFA_UFUNC
+ cdef int PCRE2_ERROR_DFA_UITEM
+ cdef int PCRE2_ERROR_DFA_WSSIZE
+ cdef int PCRE2_ERROR_INTERNAL
+ cdef int PCRE2_ERROR_JIT_BADOPTION
+ cdef int PCRE2_ERROR_JIT_STACKLIMIT
+ cdef int PCRE2_ERROR_MATCHLIMIT
+ cdef int PCRE2_ERROR_NOMEMORY
+ cdef int PCRE2_ERROR_NOSUBSTRING
+ cdef int PCRE2_ERROR_NOUNIQUESUBSTRING
+ cdef int PCRE2_ERROR_NULL
+ cdef int PCRE2_ERROR_RECURSELOOP
+ cdef int PCRE2_ERROR_DEPTHLIMIT
+ cdef int PCRE2_ERROR_RECURSIONLIMIT # Obsolete synonym.
+ cdef int PCRE2_ERROR_UNAVAILABLE
+ cdef int PCRE2_ERROR_UNSET
+ cdef int PCRE2_ERROR_BADOFFSETLIMIT
+ cdef int PCRE2_ERROR_BADREPESCAPE
+ cdef int PCRE2_ERROR_REPMISSINGBRACE
+ cdef int PCRE2_ERROR_BADSUBSTITUTION
+ cdef int PCRE2_ERROR_BADSUBSPATTERN
+ cdef int PCRE2_ERROR_TOOMANYREPLACE
+ cdef int PCRE2_ERROR_BADSERIALIZEDDATA
+ cdef int PCRE2_ERROR_HEAPLIMIT
+ cdef int PCRE2_ERROR_CONVERT_SYNTAX
+ cdef int PCRE2_ERROR_INTERNAL_DUPMATCH
+ cdef int PCRE2_ERROR_DFA_UINVALID_UTF
+
+ # Request types for pcre2_pattern_info().
+ cdef int PCRE2_INFO_ALLOPTIONS
+ cdef int PCRE2_INFO_ARGOPTIONS
+ cdef int PCRE2_INFO_BACKREFMAX
+ cdef int PCRE2_INFO_BSR
+ cdef int PCRE2_INFO_CAPTURECOUNT
+ cdef int PCRE2_INFO_FIRSTCODEUNIT
+ cdef int PCRE2_INFO_FIRSTCODETYPE
+ cdef int PCRE2_INFO_FIRSTBITMAP
+ cdef int PCRE2_INFO_HASCRORLF
+ cdef int PCRE2_INFO_JCHANGED
+ cdef int PCRE2_INFO_JITSIZE
+ cdef int PCRE2_INFO_LASTCODEUNIT
+ cdef int PCRE2_INFO_LASTCODETYPE
+ cdef int PCRE2_INFO_MATCHEMPTY
+ cdef int PCRE2_INFO_MATCHLIMIT
+ cdef int PCRE2_INFO_MAXLOOKBEHIND
+ cdef int PCRE2_INFO_MINLENGTH
+ cdef int PCRE2_INFO_NAMECOUNT
+ cdef int PCRE2_INFO_NAMEENTRYSIZE
+ cdef int PCRE2_INFO_NAMETABLE
+ cdef int PCRE2_INFO_NEWLINE
+ cdef int PCRE2_INFO_DEPTHLIMIT
+ cdef int PCRE2_INFO_RECURSIONLIMIT # Obsolete synonym
+ cdef int PCRE2_INFO_SIZE
+ cdef int PCRE2_INFO_HASBACKSLASHC
+ cdef int PCRE2_INFO_FRAMESIZE
+ cdef int PCRE2_INFO_HEAPLIMIT
+ cdef int PCRE2_INFO_EXTRAOPTIONS
+
+ # Request types for pcre2_config().
+ cdef int PCRE2_CONFIG_BSR
+ cdef int PCRE2_CONFIG_JIT
+ cdef int PCRE2_CONFIG_JITTARGET
+ cdef int PCRE2_CONFIG_LINKSIZE
+ cdef int PCRE2_CONFIG_MATCHLIMIT
+ cdef int PCRE2_CONFIG_NEWLINE
+ cdef int PCRE2_CONFIG_PARENSLIMIT
+ cdef int PCRE2_CONFIG_DEPTHLIMIT
+ cdef int PCRE2_CONFIG_RECURSIONLIMIT # Obsolete synonym
+ cdef int PCRE2_CONFIG_STACKRECURSE # Obsolete
+ cdef int PCRE2_CONFIG_UNICODE
+ cdef int PCRE2_CONFIG_UNICODE_VERSION
+ cdef int PCRE2_CONFIG_VERSION
+ cdef int PCRE2_CONFIG_HEAPLIMIT
+ cdef int PCRE2_CONFIG_NEVER_BACKSLASH_C
+ cdef int PCRE2_CONFIG_COMPILED_WIDTHS
+ cdef int PCRE2_CONFIG_TABLES_LENGTH
+
+
    # Opaque handles for PCRE2 defined structs. Only pointers to these are
    # ever passed around; their layout is private to the PCRE2 library, so
    # each is declared as an empty struct with a C-name alias.
    ctypedef struct pcre2_code_t "pcre2_code":
        pass
    ctypedef struct pcre2_match_data_t "pcre2_match_data":
        pass
    ctypedef struct pcre2_general_context_t "pcre2_general_context":
        pass
    ctypedef struct pcre2_compile_context_t "pcre2_compile_context":
        pass
    ctypedef struct pcre2_match_context_t "pcre2_match_context":
        pass

    # Basic string definition. Note that this assumes PCRE2 is compiled to
    # support 8-bit strings (PCRE2_CODE_UNIT_WIDTH=8, as set in the build).
    ctypedef const uint8_t *pcre2_sptr_t "PCRE2_SPTR"
+
+ # Error handling functions.
+ int pcre2_get_error_message(
+ int errorcode,
+ uint8_t *buffer,
+ size_t bufflen
+ )
+
+ # Pattern compilation functions.
+ pcre2_code_t * pcre2_compile(
+ pcre2_sptr_t pattern,
+ size_t length,
+ uint32_t options,
+ int *errorcode,
+ size_t *erroroffset,
+ pcre2_compile_context_t *ccontext
+ )
+
+ int pcre2_jit_compile(
+ pcre2_code_t *code,
+ uint32_t options
+ )
+
+
+ void pcre2_code_free(pcre2_code_t *code)
+
+ # Information on compiled pattern.
+ int pcre2_pattern_info(
+ const pcre2_code_t *code,
+ uint32_t what,
+ void *where
+ )
+
+ int pcre2_substring_number_from_name(
+ const pcre2_code_t *code,
+ pcre2_sptr_t name
+ )
+
+ # Matching and match data functions.
+ pcre2_match_data_t * pcre2_match_data_create(
+ uint32_t ovecsize,
+ pcre2_general_context_t *gcontext
+ )
+
+ pcre2_match_data_t * pcre2_match_data_create_from_pattern(
+ const pcre2_code_t *code,
+ pcre2_general_context_t *gcontext
+ )
+
+ int pcre2_match(
+ const pcre2_code_t *code,
+ pcre2_sptr_t subject,
+ size_t length,
+ size_t startoffset,
+ uint32_t options,
+ pcre2_match_data_t *match_data,
+ pcre2_match_context_t *mcontext
+ )
+ int pcre2_jit_match(
+ const pcre2_code_t *code,
+ pcre2_sptr_t subject,
+ size_t length,
+ size_t startoffset,
+ uint32_t options,
+ pcre2_match_data_t *match_data,
+ pcre2_match_context_t *mcontext
+ )
+
+ void pcre2_match_data_free(pcre2_match_data_t *match_data)
+
+ uint32_t pcre2_get_ovector_count(pcre2_match_data_t *match_data)
+
+ size_t *pcre2_get_ovector_pointer(pcre2_match_data_t *match_data)
+
+ int pcre2_substring_nametable_scan(
+ const pcre2_code_t *code,
+ pcre2_sptr_t name,
+ pcre2_sptr_t *first,
+ pcre2_sptr_t *last
+ )
+
+ # String extraction from match data blocks.
+ int pcre2_substring_length_byname(
+ pcre2_match_data_t *match_data,
+ pcre2_sptr_t name,
+ size_t *bufflen
+ )
+
+ int pcre2_substring_get_byname(
+ pcre2_match_data_t *match_data,
+ pcre2_sptr_t name,
+ uint8_t **bufferptr,
+ size_t *bufflen
+ )
+
+ int pcre2_substring_length_bynumber(
+ pcre2_match_data_t *match_data,
+ uint32_t number,
+ size_t *bufflen
+ )
+
+ int pcre2_substring_get_bynumber(
+ pcre2_match_data_t *match_data,
+ uint32_t number,
+ uint8_t **bufferptr,
+ size_t *bufflen
+ )
+
+ # Substitution.
+ int pcre2_substitute(
+ const pcre2_code_t *code,
+ pcre2_sptr_t subject,
+ size_t length,
+ size_t startoffset,
+ uint32_t options,
+ pcre2_match_data_t *match_data,
+ pcre2_match_context_t *mcontext,
+ pcre2_sptr_t replacement,
+ size_t rlength,
+ uint8_t *outputbuffer,
+ size_t *outlengthptr
+ )
+
+ # Serialization.
+ int32_t pcre2_serialize_decode(
+ pcre2_code_t **codes,
+ int32_t number_of_codes,
+ const uint8_t *code_bytes,
+ pcre2_general_context_t *gcontex
+ )
+ int32_t pcre2_serialize_encode(
+ pcre2_code_t **codes,
+ int32_t number_of_codes,
+ uint8_t **serialized_bytes,
+ size_t *serialized_size,
+ pcre2_general_context_t *gcontex
+ )
+ void pcre2_serialize_free(uint8_t *bytes)
--- /dev/null
+import pytest
+import pcre2
+
+
def test_match_groups():
    """Module-level match() exposes groups() like the built-in re module."""
    # str and bytes subjects, with and without a capture group.
    cases = [
        ("a", "a", ()),
        ("(a)", "a", ("a",)),
        (b"a", b"a", ()),
        (b"(a)", b"a", (b"a",)),
    ]
    # Non-ASCII code points of increasing UTF-8 width (1, 2, and 4 bytes).
    for ch in ("\xe0", "\u0430", "\U0001d49c"):
        cases.append((ch, ch, ()))
        cases.append(("(%s)" % ch, ch, (ch,)))
    for pattern, subject, expected in cases:
        assert pcre2.match(pattern, subject).groups() == expected
--- /dev/null
+import pytest
+import pcre2
+import re
+
+
+# All tests should match successfully.
# Rows: (pattern, subject, flags, pos, endpos, group, expected start, expected end).
# A pos/endpos of None means "do not pass the argument".
test_data_match_bounds = [
    (b".*", "aba•ba••ba•••b".encode(), 0, 0, None, 0, 0, 26),  # byte offsets in UTF-8
    (".*", "aba•ba••ba•••b", 0, 0, None, 0, 0, 14),  # character offsets in str
    (r"\w+", "b•", 0, 0, None, 0, 0, 1),
    (r"\w+", "b•", 0, None, None, 0, 0, 1),
    (r"\w+", "•b", 0, 1, None, 0, 1, 2),
    (r"\w+", "•bc", 0, 2, None, 0, 2, 3),
    (r"\w+", "•bc", 0, 1, 2, 0, 1, 2),
]
+
+
@pytest.mark.parametrize("pattern,subject,flags,pos,endpos,group,start,end", test_data_match_bounds)
def test_match_bounds(pattern, subject, flags, pos, endpos, group, start, end):
    """Match offsets honor the pos/endpos search bounds."""
    compiled = pcre2.compile(pattern, flags=flags)
    # Forward only the bounds that are explicitly given so that the
    # defaults of match() get exercised as well.
    bounds = {}
    if endpos is not None:
        bounds["endpos"] = endpos
    if pos is not None:
        bounds["pos"] = pos
    match = compiled.match(subject, **bounds)
    assert (match.start(group), match.end(group)) == (start, end)
    # The bounds must be echoed back on the match object.
    if endpos is not None:
        assert match.endpos == endpos
    if pos is not None:
        assert match.pos == pos
+
+
# Rows: (pattern, subject, flags, pos, expected matched substring).
test_data_match_substring = [
    (b".*", "aba•ba••ba•••b".encode(), 0, 0, "aba•ba••ba•••b".encode()),
    (".*", "aba•ba••ba•••b", 0, 0, "aba•ba••ba•••b"),
]
+
+
@pytest.mark.parametrize("pattern,subject,flags,pos,substring", test_data_match_substring)
def test_match_substring(pattern, subject, flags, pos, substring):
    """Subscripting a match with 0 returns the full matched substring."""
    match = pcre2.compile(pattern, flags=flags).match(subject, pos=pos)
    assert match[0] == substring
+
+
# Rows: (pattern, replacement template, subject, flags, pos, expected expansion).
# "$0" is PCRE2 substitution syntax for the whole match.
test_data_match_expand = [
    (b"[abc]+", b"$0", b"dabacbaccbacccb", 0, 0, b"abacbaccbacccb"),
    ("[abc]+", "$0", "dabacbaccbacccb", 0, 0, "abacbaccbacccb"),
    ("[abc]+", "$0", "dabacbaccbacccb", 0, 10, "acccb"),
]
+
+
@pytest.mark.parametrize("pattern,replacement,subject,flags,pos,result", test_data_match_expand)
def test_match_expand(pattern, replacement, subject, flags, pos, result):
    """Match.expand() applies a substitution template to a single match."""
    match = pcre2.compile(pattern, flags=flags).search(subject, pos=pos)
    assert match.expand(replacement) == result
--- /dev/null
+import pytest
+import pcre2
+from pcre2._cy import LibraryError
+
+
# Rows: (pattern, flags, expected outcome code for compilation).
test_data_pattern_compile_success = [
    (b"a+b+c*d*", 0, "SUCCESS"),
    (b"(?<foo>a+b+)c*d*", 0, "SUCCESS"),
    (b"(?<foo>a+b+))c*d*", 0, "COMPILE_ERROR"),  # Unbalanced parenthesis.
    ("å+∫+ç*∂*".encode(), 0, "SUCCESS"),
    ("a+b+c*d*", 0, "SUCCESS"),
    ("(?<foo>a+b+)c*d*", 0, "SUCCESS"),
    ("(?<foo>a+b+))c*d*", 0, "COMPILE_ERROR"),  # Unbalanced parenthesis.
    ("(?<<foo>a+b+)c*d*", 0, "COMPILE_ERROR"),  # Malformed group name.
    ("(?<foo>a+b+)c*d*(?<foo>a+b+)", 0, "COMPILE_ERROR"),  # Duplicate group name.
    ("å+∫+ç*∂*", 0, "SUCCESS"),
    ("(?<ƒøø>a+b+)c*d*", 0, "SUCCESS"),
]
+
+
@pytest.mark.parametrize("pattern,flags,return_code", test_data_pattern_compile_success)
def test_pattern_compile_success(pattern, flags, return_code):
    """Interpreted (non-JIT) compilation succeeds or fails as expected."""
    rc = "SUCCESS"
    try:
        compiled = pcre2.compile(pattern, flags=flags, jit=False)
    except pcre2.PatternError:
        rc = "COMPILE_ERROR"
    except pcre2.LibraryError:
        rc = "LIB_ERROR"
    else:
        # JIT was explicitly disabled, so the pattern must not report it.
        assert not compiled.jit
    assert rc == return_code
+
+
@pytest.mark.parametrize("pattern,flags,return_code", test_data_pattern_compile_success)
def test_pattern_jit_compile_success(pattern, flags, return_code):
    """JIT compilation succeeds or fails exactly like interpreted compilation."""
    rc = "SUCCESS"
    try:
        compiled = pcre2.compile(pattern, flags=flags, jit=True)
    except pcre2.PatternError:
        rc = "COMPILE_ERROR"
    except pcre2.LibraryError:
        rc = "LIB_ERROR"
    else:
        # JIT was requested, so the pattern must report it.
        assert compiled.jit
    assert rc == return_code
+
+
# Rows: (pattern, flags, expected groupindex mapping of name -> group number).
test_data_pattern_groupindex = [
    (b"(?<foo>a+b+)c*d*", 0, {"foo": 1}),
    ("(?<foo>a+b+)c*d*", 0, {"foo": 1}),
    ("(?<ƒøø>a+b+)c*d*", 0, {"ƒøø": 1}),
    ("(?<foo>a+b+)c*d*(?<bar>a+b+)", 0, {"foo": 1, "bar": 2}),
    # Unnamed groups still occupy a number; "bar" is group 3 here.
    ("(?<foo>a+b+)c*(.+)d*(?<bar>a+b+)", 0, {"foo": 1, "bar": 3}),
]
+
+
@pytest.mark.parametrize("pattern,flags,groupindex", test_data_pattern_groupindex)
def test_pattern_groupindex(pattern, flags, groupindex):
    """groupindex maps named groups to their numeric positions."""
    assert pcre2.compile(pattern, flags=flags).groupindex == groupindex
+
+
# Rows: (pattern, subject, flags, pos, expected outcome code for search()).
test_data_pattern_match_success = [
    (b".*", b"abacbaccbacccb", 0, 0, "SUCCESS"),
    (".*", "abacbaccbacccb", 0, 0, "SUCCESS"),
    ("ac{3,}b", "abacbaccbacccb", 0, 0, "SUCCESS"),
    ("a•{3,}b", "aba•ba••ba•••b", 0, 0, "SUCCESS"),
    ("ab", "abacbaccbacccb", 0, 2, "UNMATCHED"),  # "ab" only occurs before pos 2.
    ("((((((((((((((()))))))))))))))", "", 0, 0, "SUCCESS"),  # Deeply nested empty groups.
]
+
+
@pytest.mark.parametrize("pattern,subject,flags,pos,return_code", test_data_pattern_match_success)
def test_pattern_match_success(pattern, subject, flags, pos, return_code):
    """search() reports a match, no match, or a library error as expected.

    Fix: dropped the unused ``as e`` binding on the except clause.
    """
    p = pcre2.compile(pattern, flags=flags)
    try:
        m = p.search(subject, pos=pos)
        rc = "SUCCESS" if m else "UNMATCHED"
    except LibraryError:
        # Only the outcome matters here, not the exception details.
        rc = "LIB_ERROR"
    assert rc == return_code
+
+
# Rows: (pattern, subject, pos, expected number of matches from finditer()).
test_data_pattern_scan_length = [
    (b".+", b"abacbaccbacccb", 0, 1),
    (b".*", b"abacbaccbacccb", 0, 2),  # ".*" also yields an empty match at the end.
    (".+", "abacbaccbacccb", 0, 1),
    (".*", "abacbaccbacccb", 0, 2),
    ("[abc]*", "dabacbaccbacccb", 0, 3),  # empty at 0, the long run, empty at end
    ("ac{2,}b", "abacbaccbacccb", 0, 2),
    ("a•{2,}b", "aba•ba••ba•••b", 0, 2),
    ("a•*b", "aba•ba••ba•••b", 0, 4),
    ("ab", "abacbaccbacccb", 2, 0),  # only occurrence lies before pos
]
+
+
@pytest.mark.parametrize("pattern,subject,pos,iter_length", test_data_pattern_scan_length)
def test_pattern_scan_length(pattern, subject, pos, iter_length):
    """finditer() yields exactly the expected number of matches.

    Fix: removed the redundant ``iter()`` wrapper — ``list()`` already
    consumes any iterable.
    """
    p = pcre2.compile(pattern)
    matches = p.finditer(subject, pos=pos)
    assert len(list(matches)) == iter_length
+
+
# Renamed from `test_pattern_substitute` to follow the sibling `test_data_*`
# naming convention and to stop the test function below from shadowing the
# data list after its definition.
# Rows: (pattern, replacement, subject, count, expected result).
test_data_pattern_substitute = [
    (b"[abc]*", b"", b"dabacbaccbacccb", 1, b"dabacbaccbacccb"),
    ("[abc]*", "", "dabacbaccbacccb", 1, "dabacbaccbacccb"),
    ("[abc]*", "", "dabacbaccbacccb", 0, "d"),
    ("a(•{2,})b", "a•b", "aba•ba••ba•••b", 0, "aba•ba•ba•b"),
    ("a(•{2,})b", "a$1b", "aba•ba••ba•••b", 0, "aba•ba••ba•••b"),
    ("a(•{2,})b", lambda m: m[0] + m[0], "aba•ba••ba•••b", 0, "aba•ba••ba••ba•••ba•••b"),
    ("a(•{2,})b", lambda m: m[1] + m[1], "aba•ba••ba•••b", 0, "aba•b••••••••••"),
]


@pytest.mark.parametrize("pattern,replacement,subject,count,result", test_data_pattern_substitute)
def test_pattern_substitute(pattern, replacement, subject, count, result):
    """sub() with literal, template, and callable replacements honors count."""
    p = pcre2.compile(pattern)
    assert p.sub(replacement, subject, count) == result
+
+
def test_pattern_findall():
    """Pattern.findall(): group tuples, class "set operator" literals, DOTALL."""
    p = pcre2.compile(r"(\w+)=(\d+)")
    assert p.findall("set width=20 and height=10") == [("width", "20"), ("height", "10")]
    # All 7-bit code points as a probe subject for the character classes below.
    s = bytes(range(128)).decode()
    # The doubled characters ("--", "&&", "||", "~~") must NOT be treated as
    # set operators inside a class — the expected lists below are the plain
    # literal interpretation.
    p2 = pcre2.compile(r"[0-9--1]")
    assert p2.findall(s) == list("-./0123456789")
    p3 = pcre2.compile(r"[%--1]")
    assert p3.findall(s) == list("%&'()*+,-1")
    p4 = pcre2.compile(r"[%--]")
    assert p4.findall(s) == list("%&'()*+,-")
    p5 = pcre2.compile(r"[0-9&&1]")
    assert p5.findall(s) == list("&0123456789")
    p6 = pcre2.compile(r"[\d&&1]")
    assert p6.findall(s) == list("&0123456789")
    p7 = pcre2.compile(r"[0-9||a]")
    assert p7.findall(s) == list("0123456789a|")
    p8 = pcre2.compile(r"[\d||a]")
    assert p8.findall(s) == list("0123456789a|")
    p9 = pcre2.compile(r"[0-9~~1]")
    assert p9.findall(s) == list("0123456789~")
    p10 = pcre2.compile(r"[\d~~1]")
    assert p10.findall(s) == list("0123456789~")
    p11 = pcre2.compile(r"[[0-9]|]")
    assert p11.findall(s) == list("0123456789[]")

    # Greedy and lazy repetitions of "." under DOTALL still find "xyz",
    # for both str and bytes patterns.
    for reps in "*", "+", "?", "{1}":
        for mod in "", "?":
            pattern = "." + reps + mod + "yz"
            assert pcre2.compile(pattern, pcre2.S).findall("xyz") == ["xyz"], pattern
            pattern = pattern.encode()
            assert pcre2.compile(pattern, pcre2.S).findall(b"xyz") == [b"xyz"], pattern
+
+
def test_pattern_jit_findall():
    """Module-level findall() (JIT path): groups, Unicode, possessive forms."""
    assert pcre2.findall(r"(\w+)=(\d+)", "set width=20 and height=10") == [
        ("width", "20"),
        ("height", "10"),
    ]
    assert pcre2.findall(":+", "abc") == []
    assert pcre2.findall(":+", "a:b::c:::d") == [":", "::", ":::"]
    assert pcre2.findall("(:+)", "a:b::c:::d") == [":", "::", ":::"]

    # 1-, 2-, and 4-byte UTF-8 code points.
    for x in ("\xe0", "\u0430", "\U0001d49c"):
        xx = x * 2
        xxx = x * 3
        string = "a%sb%sc%sd" % (x, xx, xxx)
        assert pcre2.findall("%s+" % x, string) == [x, xx, xxx]
        assert pcre2.findall("(%s+)" % x, string) == [x, xx, xxx]

    # Word-boundary assertions produce zero-width matches.
    assert len(pcre2.findall(r"\b", "a")) == 2
    assert len(pcre2.findall(r"\B", "a")) == 0
    assert len(pcre2.findall(r"\b", " ")) == 0
    # NOTE(review): this line duplicates the one above — possibly a different
    # subject was intended here; confirm against the original test.
    assert len(pcre2.findall(r"\b", " ")) == 0
    assert len(pcre2.findall(r"\B", " ")) == 2

    # Doubled class characters are literals, not set operators.
    s = bytes(range(128)).decode()
    assert pcre2.findall(r"[--1]", s) == list("-./01")
    assert pcre2.findall(r"[&&1]", s) == list("&1")
    assert pcre2.findall(r"[||1]", s) == list("1|")
    assert pcre2.findall(r"[~~1]", s) == list("1~")

    # Case-insensitive backreference.
    assert pcre2.findall(r"(?i)(a)\1", "aa \u0100") == ["a"]

    # Possessive quantifiers.
    assert pcre2.findall(r"a++", "aab") == ["aa"]
    assert pcre2.findall(r"a*+", "aab") == ["aa", "", ""]
    assert pcre2.findall(r"a?+", "aab") == ["a", "a", "", ""]
    assert pcre2.findall(r"a{1,3}+", "aab") == ["aa"]

    assert pcre2.findall(r"(?:ab)++", "ababc") == ["abab"]
    assert pcre2.findall(r"(?:ab)*+", "ababc") == ["abab", "", ""]
    assert pcre2.findall(r"(?:ab)?+", "ababc") == ["ab", "ab", "", ""]
    assert pcre2.findall(r"(?:ab){1,3}+", "ababc") == ["abab"]

    # Atomic groups behave like the possessive forms above.
    assert pcre2.findall(r"(?>a+)", "aab") == ["aa"]
    assert pcre2.findall(r"(?>a*)", "aab") == ["aa", "", ""]
    assert pcre2.findall(r"(?>a?)", "aab") == ["a", "a", "", ""]
    assert pcre2.findall(r"(?>a{1,3})", "aab") == ["aa"]

    assert pcre2.findall(r"(?>(?:ab)+)", "ababc") == ["abab"]
    assert pcre2.findall(r"(?>(?:ab)*)", "ababc") == ["abab", "", ""]
    assert pcre2.findall(r"(?>(?:ab)?)", "ababc") == ["ab", "ab", "", ""]
    assert pcre2.findall(r"(?>(?:ab){1,3})", "ababc") == ["abab"]

    import re

    # Escaped multi-byte literal over a bytes subject.
    b = "y\u2620y\u2620y".encode("utf-8")
    assert len(pcre2.findall(re.escape("\u2620".encode("utf-8")), b)) == 2
+
+
def test_pattern_split():
    """Pattern.split() splits on any of several Unicode full-stop characters."""
    dots = "[\u002e\u3002\uff0e\uff61]"
    compiled = pcre2.compile(dots)
    assert compiled.split("a.b.c") == ["a", "b", "c"]
+
+
def test_pattern_jit_split():
    """Module-level split() (JIT path): captures, bytes, Unicode, maxsplit."""
    assert pcre2.split(":", ":a:b::c") == ["", "a", "b", "", "c"]
    assert pcre2.split(":+", ":a:b::c") == ["", "a", "b", "c"]
    # A capturing separator is interleaved into the result.
    assert pcre2.split("(:+)", ":a:b::c") == ["", ":", "a", ":", "b", "::", "c"]

    assert pcre2.split(b":", b":a:b::c") == [b"", b"a", b"b", b"", b"c"]
    assert pcre2.split(b":+", b":a:b::c") == [b"", b"a", b"b", b"c"]
    assert pcre2.split(b"(:+)", b":a:b::c") == [b"", b":", b"a", b":", b"b", b"::", b"c"]

    # 1-, 2-, and 4-byte UTF-8 code points.
    for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432", "\U0001d49c\U0001d49e\U0001d4b5"):
        string = ":%s:%s::%s" % (a, b, c)
        assert pcre2.split(":", string) == ["", a, b, "", c]
        assert pcre2.split(":+", string) == ["", a, b, c]
        assert pcre2.split("(:+)", string) == ["", ":", a, ":", b, "::", c]

    # Non-capturing separators are not interleaved.
    assert pcre2.split("(?::+)", ":a:b::c") == ["", "a", "b", "c"]
    assert pcre2.split("([b:]+)", ":a:b::c") == ["", ":", "a", ":b::", "c"]
    assert pcre2.split("(?:b)|(?::+)", ":a:b::c") == ["", "a", "", "", "c"]

    # maxsplit, positionally and by keyword.
    assert pcre2.split(":", ":a:b::c", 2) == ["", "a", "b::c"]
    assert pcre2.split(":", ":a:b::c", maxsplit=2) == ["", "a", "b::c"]
    assert pcre2.split(":", "a:b:c:d", maxsplit=2) == ["a", "b", "c:d"]
    assert pcre2.split("(:)", ":a:b::c", maxsplit=2) == ["", ":", "a", ":", "b::c"]
    assert pcre2.split("(:+)", ":a:b::c", maxsplit=2) == ["", ":", "a", ":", "b::c"]
--- /dev/null
+import pcre2 as re
+import string
+import multiprocessing
+from weakref import proxy
+import pytest
+
+from tests.utils import (
+ assert_raises,
+ assert_typed_equal,
+ check_pattern_error,
+ check_template_error,
+)
+
+# This file is a modified version of the tests from CPython's regex test suite, meant to provide
+# coverage for the built-in module's behavior. However, the intention is not to cover 100% of
+# Python tests. Some functionality will remain different, such as the equality of compiled
+# patterns. The goal is to cover enough of the API to make using PCRE2 feel like using the built-in
+# module. For the tests included, you can find original versions in the link below (Python bug IDs
+# are preserved for searching):
+# https://github.com/python/cpython/blob/3.14/Lib/test/test_re.py
+
+
class S(str):
    """str subclass whose indexing/slicing preserves the subclass type."""

    def __getitem__(self, index):
        piece = super().__getitem__(index)
        return S(piece)
+
+
class B(bytes):
    """bytes subclass whose slicing preserves the subclass type."""

    def __getitem__(self, index):
        piece = super().__getitem__(index)
        return B(piece)
+
+
def test_weakref():
    """A compiled pattern works identically through a weakref proxy.

    Fix: the local ``s`` was assigned but never used — the subject string
    is now actually passed to both findall() calls.
    """
    subject = "QabbbcR"
    pattern = re.compile("ab+c")
    via_proxy = proxy(pattern)
    assert pattern.findall(subject) == via_proxy.findall(subject)
+
+
def test_search_star_plus():
    """search() scans forward while match() anchors at the subject start."""
    span_cases = [
        (re.search, "x*", "axx", (0, 0)),
        (re.search, "x+", "axx", (1, 3)),
        (re.match, "a*", "xxx", (0, 0)),
        (re.match, "x*", "xxxa", (0, 3)),
    ]
    for func, pattern, subject, span in span_cases:
        # span(0) and the no-argument form must agree.
        assert func(pattern, subject).span(0) == span
        assert func(pattern, subject).span() == span
    # No occurrence at all yields None.
    assert re.search("x", "aaa") is None
    assert re.match("a+", "xxx") is None
+
+
def test_branching():
    """Test Branching
    Test expressions using the OR ('|') operator."""
    span_cases = [
        ("(ab|ba)", "ab", (0, 2)),
        ("(ab|ba)", "ba", (0, 2)),
        ("(abc|bac|ca|cb)", "abc", (0, 3)),
        ("(abc|bac|ca|cb)", "bac", (0, 3)),
        ("(abc|bac|ca|cb)", "ca", (0, 2)),
        ("(abc|bac|ca|cb)", "cb", (0, 2)),
        ("((a)|(b)|(c))", "a", (0, 1)),
        ("((a)|(b)|(c))", "b", (0, 1)),
        ("((a)|(b)|(c))", "c", (0, 1)),
    ]
    for pattern, subject, span in span_cases:
        assert re.match(pattern, subject).span() == span
+
+
def bump_num(matchobj):
    """Replacement callback: return the matched integer incremented by one."""
    return str(int(matchobj.group(0)) + 1)
+
+
def test_basic_re_sub():
    """sub() across str/bytes subclasses, callable replacements, and escapes."""
    assert_typed_equal(re.sub("y", "a", "xyz"), "xaz")
    assert_typed_equal(re.sub("y", S("a"), S("xyz")), "xaz")
    assert_typed_equal(re.sub(b"y", b"a", b"xyz"), b"xaz")
    assert_typed_equal(re.sub(b"y", B(b"a"), B(b"xyz")), b"xaz")
    assert_typed_equal(re.sub(b"y", bytearray(b"a"), bytearray(b"xyz")), b"xaz")
    assert_typed_equal(re.sub(b"y", memoryview(b"a"), memoryview(b"xyz")), b"xaz")

    # 1-, 2-, and 4-byte UTF-8 code points as the pattern.
    for y in ("\xe0", "\u0430", "\U0001d49c"):
        assert re.sub(y, "a", "x%sz" % y) == "xaz"

    assert re.sub("(?i)b+", "x", "bbbb BBBB") == "x x"
    # Callable replacement: bump each matched integer by one.
    assert re.sub(r"\d+", bump_num, "08.2 -2 23x99y") == "9.3 -3 24x100y"

    assert re.sub(r"\d+", bump_num, "08.2 -2 23x99y", count=3) == "9.3 -3 23x99y"

    # A callable's return value is taken literally; a string template is expanded.
    assert re.sub(".", lambda m: r"\n", "x") == "\\n"
    assert re.sub(".", r"\n", "x") == "\n"

    s = r"\g<1>\g<1>"
    assert re.sub("(.)", s, "x") == "xx"
    assert re.sub("(.)", s.replace("\\", r"\\"), "x") == s
    assert re.sub("(.)", lambda m: s, "x") == s

    # Named and numbered group references are interchangeable in templates.
    assert re.sub("(?P<a>x)", r"\g<a>\g<a>", "xx") == "xxxx"
    assert re.sub("(?P<a>x)", r"\g<a>\g<1>", "xx") == "xxxx"
    assert re.sub("(?P<unk>x)", r"\g<unk>\g<unk>", "xx") == "xxxx"
    assert re.sub("(?P<unk>x)", r"\g<1>\g<1>", "xx") == "xxxx"
    assert re.sub("()x", r"\g<0>\g<0>", "xx") == "xxxx"

    # Standard C-style escapes in templates.
    assert re.sub("a", r"\t\n\v\r\f\a\b", "a") == "\t\n\v\r\f\a\b"
    assert re.sub("a", "\t\n\v\r\f\a\b", "a") == "\t\n\v\r\f\a\b"
    assert re.sub("a", "\t\n\v\r\f\a\b", "a") == (
        chr(9) + chr(10) + chr(11) + chr(13) + chr(12) + chr(7) + chr(8)
    )

    # Note that we removed the reserved characters in PCRE2 extended substitution syntax
    for c in "cdhijkmopqswxyzABCDFGHIJKMNOPRSTVWXYZ":
        with pytest.raises(re.LibraryError):
            assert re.sub("a", "\\" + c, "a") == "\\" + c

    assert re.sub(r"^\s*", "X", "test") == "Xtest"
+
+
def test_bug_449964():
    """A group reference followed by another escape must expand both."""
    replaced = re.sub(r"(?P<unk>x)", r"\g<1>\g<1>\b", "xx")
    assert replaced == "xx\bxx\b"
+
+
def test_bug_449000():
    """sub() treats CR/LF the same whether given as raw escapes or literals."""
    for pattern in (r"\r\n", "\r\n"):
        for repl in (r"\n", "\n"):
            assert re.sub(pattern, repl, "abc\r\ndef\r\n") == "abc\ndef\n"
+
+
def test_bug_1661():
    """Flags passed together with an already-compiled pattern must raise."""
    pattern = re.compile(".")
    for func in (re.match, re.search, re.findall):
        assert_raises(ValueError, func, pattern, "A", re.I)
    assert_raises(ValueError, re.compile, pattern, re.I)
+
+
def test_bug_3629():
    """A pattern that once crashed the sre-code validator must compile cleanly."""
    tricky = "(?P<quote>)(?(quote))"
    re.compile(tricky)
+
+
def test_sub_template_numeric_escape():
    # bug 776311 and friends
    # \0 always means NUL in a template, never a group reference.
    assert re.sub("x", r"\0", "x") == "\0"
    assert re.sub("x", r"\000", "x") == "\000"
    assert re.sub("x", r"\001", "x") == "\001"
    # Octal parsing stops at a non-octal digit; the rest is literal.
    assert re.sub("x", r"\008", "x") == "\0" + "8"
    assert re.sub("x", r"\009", "x") == "\0" + "9"
    assert re.sub("x", r"\111", "x") == "\111"
    assert re.sub("x", r"\117", "x") == "\117"
    assert re.sub("x", r"\377", "x") == "\377"

    # An octal escape consumes at most three digits.
    assert re.sub("x", r"\1111", "x") == "\1111"
    assert re.sub("x", r"\1111", "x") == "\111" + "1"

    assert re.sub("x", r"\00", "x") == "\x00"
    assert re.sub("x", r"\07", "x") == "\x07"
    assert re.sub("x", r"\08", "x") == "\0" + "8"
    assert re.sub("x", r"\09", "x") == "\0" + "9"
    assert re.sub("x", r"\0a", "x") == "\0" + "a"

    # in python2.3 (etc), these loop endlessly in sre_parser.py

    # With 11+ groups, \11 is a group reference rather than an octal escape.
    assert re.sub("(((((((((((x)))))))))))", r"\11", "x") == "x"
    assert re.sub("((((((((((y))))))))))(.)", r"\11a", "xyz") == "xza"

    # Modified for different parsing behavior in PCRE2
    assert re.sub("((((((((((y))))))))))(.)", r"\g<11>8", "xyz") == "xz8"
+
+
def test_qualified_re_sub():
    """count limits substitutions; duplicated positional/keyword args raise."""
    assert re.sub("a", "b", "aaaaa") == "bbbbb"
    assert re.sub("a", "b", "aaaaa", count=1) == "baaaa"

    # count/flags given both positionally and by keyword must conflict,
    # with the same TypeError wording as a regular Python function.
    with pytest.raises(TypeError, match=r"sub\(\) got multiple values for argument 'count'"):
        re.sub("a", "b", "aaaaa", 1, count=1)
    with pytest.raises(TypeError, match=r"sub\(\) got multiple values for argument 'flags'"):
        re.sub("a", "b", "aaaaa", 1, 0, flags=0)
    with pytest.raises(
        TypeError, match=r"sub\(\) takes from 3 to 6 positional arguments but 7 were given"
    ):
        re.sub("a", "b", "aaaaa", 1, 0, False, 0)
+
+
def test_bug_114660():
    # NOTE(review): in the CPython original the subject is "hello  there"
    # (two spaces) so that the \s+ collapse is observable — verify this
    # subject string was not whitespace-mangled in transit.
    assert re.sub(r"(\S)\s+(\S)", r"\1 \2", "hello there") == "hello there"
+
+
def test_symbolic_groups():
    """Named groups, backreferences, and conditional groups compile and match."""
    re.compile(r"(?P<a>x)(?P=a)(?(a)y)")
    re.compile(r"(?P<a1>x)(?P=a1)(?(a1)y)")
    re.compile(r"(?P<a1>x)\1(?(1)y)")
    re.compile(b"(?P<a1>x)(?P=a1)(?(a1)y)")
    # New valid identifiers in Python 3
    re.compile("(?P<µ>x)(?P=µ)(?(µ)y)")
    re.compile("(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)")
    # Support > 100 groups.
    pat = "|".join("x(?P<a%d>%x)y" % (i, i) for i in range(1, 200 + 1))
    pat = "(?:%s)(?(200)z|t)" % pat
    assert re.match(pat, "xc8yz").span() == (0, 5)
+
+
def test_symbolic_groups_errors():
    # This test originally tested error messages, but we only test failure of compilation as
    # messages are managed by PCRE2
    check_pattern_error(r"(?P<a>)(?P<a>)")  # Duplicate group name.
    check_pattern_error(r"(?Pxy)")
    check_pattern_error(r"(?P<a>)(?P=a")
    check_pattern_error(r"(?P=")
    check_pattern_error(r"(?P=)aaaaaaaaaaaaaaa")
    check_pattern_error(r"(?P=1)")
    check_pattern_error(r"(?P=a)")  # Reference to an undefined group.
    check_pattern_error(r"(?P=a1)")
    check_pattern_error(r"(?P=a.)")
    check_pattern_error(r"(?P<)")
    check_pattern_error(r"(?P<a")
    check_pattern_error(r"(?P<")
    check_pattern_error(r"(?P<>)")
    check_pattern_error(r"(?P<1>)")  # Group names may not start with a digit.
    check_pattern_error(r"(?P<a.>)")
    # Malformed conditional groups.
    check_pattern_error(r"(?(")
    check_pattern_error(r"(?())")
    check_pattern_error(r"(?(a))")
    check_pattern_error(r"(?(-1))")
    check_pattern_error(r"(?(1a))")
    check_pattern_error(r"(?(a.))")
    # Non-identifier Unicode characters are invalid in group names.
    check_pattern_error("(?P<©>x)")
    check_pattern_error("(?P=©)")
    check_pattern_error("(?(©)y)")
    check_pattern_error(b"(?P<\xc2\xb5>x)")
    check_pattern_error(b"(?P=\xc2\xb5)")
    check_pattern_error(b"(?(\xc2\xb5)y)")
+
+
def test_symbolic_refs():
    """Named and numbered group references expand correctly in templates."""
    # Unmatched alternatives expand to the empty string.
    assert re.sub("(?P<a>x)|(?P<b>y)", r"\g<b>", "xx") == ""
    assert re.sub("(?P<a>x)|(?P<b>y)", r"\2", "xx") == ""
    assert re.sub(b"(?P<a1>x)", rb"\g<a1>", b"xx") == b"xx"
    # New valid identifiers in Python 3
    assert re.sub("(?P<µ>x)", r"\g<µ>", "xx") == "xx"
    assert re.sub("(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)", r"\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>", "xx") == "xx"
    # Support > 100 groups.
    pat = "|".join("x(?P<a%d>%x)y" % (i, i) for i in range(1, 200 + 1))
    assert re.sub(pat, r"\g<200>", "xc8yzxc8y") == "c8zc8"
+
+
def test_symbolic_refs_errors():
    """Malformed or unknown group references in templates are rejected."""
    check_template_error("(?P<a>x)", r"\g<a", "xx")
    check_template_error("(?P<a>x)", r"\g<", "xx")
    check_template_error("(?P<a>x)", r"\g", "xx")
    check_template_error("(?P<a>x)", r"\g<a a>", "xx")
    check_template_error("(?P<a>x)", r"\g<>", "xx")
    check_template_error("(?P<a>x)", r"\g<1a1>", "xx")
    # References to groups that do not exist in the pattern.
    check_template_error("(?P<a>x)", r"\g<2>", "xx")
    check_template_error("(?P<a>x)", r"\2", "xx")
    check_template_error("(?P<a>x)", r"\g<ab>", "xx")
    check_template_error("(?P<a>x)", r"\g<-1>", "xx")
    check_template_error("(?P<a>x)", r"\g<+1>", "xx")
    check_template_error("()" * 10, r"\g<1_0>", "xx")
    check_template_error("(?P<a>x)", r"\g< 1 >", "xx")
    # Non-identifier characters inside the angle brackets.
    check_template_error("(?P<a>x)", r"\g<©>", "xx")
    check_template_error(b"(?P<a>x)", b"\\g<\xc2\xb5>", b"xx")
    # Non-ASCII digits / numerals are not accepted as group numbers.
    check_template_error("(?P<a>x)", r"\g<㊀>", "xx")
    check_template_error("(?P<a>x)", r"\g<¹>", "xx")
    check_template_error("(?P<a>x)", r"\g<१>", "xx")
+
+
def test_re_subn():
    """subn() returns the substituted string together with the count."""
    assert re.subn("(?i)b+", "x", "bbbb BBBB") == ("x x", 2)
    assert re.subn("b+", "x", "bbbb BBBB") == ("x BBBB", 1)
    assert re.subn("b+", "x", "xyz") == ("xyz", 0)
    assert re.subn("b*", "x", "xyz") == ("xxxyxzx", 4)
    assert re.subn("b*", "x", "xyz", count=2) == ("xxxyz", 2)

    # The same argument supplied both positionally and by keyword must raise.
    for args, kwargs in (
        (("a", "b", "aaaaa", 1), {"count": 1}),
        (("a", "b", "aaaaa", 1, 0), {"flags": 0}),
    ):
        with pytest.raises(TypeError):
            re.subn(*args, **kwargs)
+
+
def test_re_split():
    """split() across subclasses, captures, empty matches, and lookarounds."""
    for string in (":a:b::c", S(":a:b::c")):
        assert_typed_equal(re.split(":", string), ["", "a", "b", "", "c"])
        assert_typed_equal(re.split(":+", string), ["", "a", "b", "c"])
        assert_typed_equal(re.split("(:+)", string), ["", ":", "a", ":", "b", "::", "c"])
    for string in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"), memoryview(b":a:b::c")):
        assert_typed_equal(re.split(b":", string), [b"", b"a", b"b", b"", b"c"])
        assert_typed_equal(re.split(b":+", string), [b"", b"a", b"b", b"c"])
        assert_typed_equal(re.split(b"(:+)", string), [b"", b":", b"a", b":", b"b", b"::", b"c"])
    # 1-, 2-, and 4-byte UTF-8 code points.
    for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432", "\U0001d49c\U0001d49e\U0001d4b5"):
        string = ":%s:%s::%s" % (a, b, c)
        assert re.split(":", string) == ["", a, b, "", c]
        assert re.split(":+", string) == ["", a, b, c]
        assert re.split("(:+)", string) == ["", ":", a, ":", b, "::", c]

    assert re.split("(?::+)", ":a:b::c") == ["", "a", "b", "c"]
    # A repeated capturing group keeps only the last repetition.
    assert re.split("(:)+", ":a:b::c") == ["", ":", "a", ":", "b", ":", "c"]
    assert re.split("([b:]+)", ":a:b::c") == ["", ":", "a", ":b::", "c"]
    # With alternated capturing groups, the unmatched one appears as None.
    assert re.split("(b)|(:+)", ":a:b::c") == [
        "",
        None,
        ":",
        "a",
        None,
        ":",
        "",
        "b",
        None,
        "",
        None,
        "::",
        "c",
    ]
    assert re.split("(?:b)|(?::+)", ":a:b::c") == ["", "a", "", "", "c"]

    # Separators that can match the empty string split between characters.
    for sep, expected in [
        (":*", ["", "", "a", "", "b", "", "c", ""]),
        ("(?::*)", ["", "", "a", "", "b", "", "c", ""]),
        ("(:*)", ["", ":", "", "", "a", ":", "", "", "b", "::", "", "", "c", "", ""]),
        ("(:)*", ["", ":", "", None, "a", ":", "", None, "b", ":", "", None, "c", None, ""]),
    ]:
        assert_typed_equal(re.split(sep, ":a:b::c"), expected)

    # Zero-width separators: empty pattern, word boundary, and lookarounds.
    for sep, expected in [
        ("", ["", ":", "a", ":", "b", ":", ":", "c", ""]),
        (r"\b", [":", "a", ":", "b", "::", "c", ""]),
        (r"(?=:)", ["", ":a", ":b", ":", ":c"]),
        (r"(?<=:)", [":", "a:", "b:", ":", "c"]),
    ]:
        assert_typed_equal(re.split(sep, ":a:b::c"), expected)
+
+
def test_qualified_re_split():
    """maxsplit caps the number of splits; duplicate args must raise."""
    cases = [
        (":", ":a:b::c", ["", "a", "b::c"]),
        (":", "a:b:c:d", ["a", "b", "c:d"]),
        ("(:)", ":a:b::c", ["", ":", "a", ":", "b::c"]),
        ("(:+)", ":a:b::c", ["", ":", "a", ":", "b::c"]),
        ("(:*)", ":a:b::c", ["", ":", "", "", "a:b::c"]),
    ]
    for pattern, subject, expected in cases:
        assert re.split(pattern, subject, maxsplit=2) == expected

    # maxsplit/flags passed both positionally and by keyword must conflict.
    with pytest.raises(TypeError):
        re.split(":", ":a:b::c", 2, maxsplit=2)
    with pytest.raises(TypeError):
        re.split(":", ":a:b::c", 2, 0, flags=0)
+
+
def test_re_findall():
    """findall() over subclasses, bytes-likes, and multi-byte code points.

    Fix: the final assertion used a comma instead of ``==``, turning it into
    ``assert <truthy findall result>, <message>`` so the comparison was never
    performed (and the "expected" list itself contained a stray ``==``).
    """
    assert re.findall(":+", "abc") == []
    for string in ("a:b::c:::d", S("a:b::c:::d")):
        assert_typed_equal(re.findall(":+", string), [":", "::", ":::"])
        assert_typed_equal(re.findall("(:+)", string), [":", "::", ":::"])
        assert_typed_equal(re.findall("(:)(:*)", string), [(":", ""), (":", ":"), (":", "::")])
    for string in (
        b"a:b::c:::d",
        B(b"a:b::c:::d"),
        bytearray(b"a:b::c:::d"),
        memoryview(b"a:b::c:::d"),
    ):
        assert_typed_equal(re.findall(b":+", string), [b":", b"::", b":::"])
        assert_typed_equal(re.findall(b"(:+)", string), [b":", b"::", b":::"])
        assert_typed_equal(
            re.findall(b"(:)(:*)", string), [(b":", b""), (b":", b":"), (b":", b"::")]
        )
    # 1-, 2-, and 4-byte UTF-8 code points.
    for x in ("\xe0", "\u0430", "\U0001d49c"):
        xx = x * 2
        xxx = x * 3
        string = "a%sb%sc%sd" % (x, xx, xxx)
        assert re.findall("%s+" % x, string) == [x, xx, xxx]
        assert re.findall("(%s+)" % x, string) == [x, xx, xxx]
        assert re.findall("(%s)(%s*)" % (x, x), string) == [(x, ""), (x, x), (x, xx)]
+
+
def test_bug_117612():
    """Unmatched inner groups appear as empty strings in findall() tuples.

    Fix: the assertion used a comma instead of ``==`` (assert-with-message),
    so the expected list was never compared against the result.
    """
    assert re.findall(r"(a|(b))", "aba") == [("a", ""), ("b", "b"), ("a", "")]
+
+
def test_re_match():
    """match() group access over str/bytes subclasses and Unicode subjects."""
    for string in ("a", S("a")):
        assert re.match("a", string).groups() == ()
        assert re.match("(a)", string).groups() == ("a",)
        assert re.match("(a)", string).group(0) == "a"
        assert re.match("(a)", string).group(1) == "a"
        # Repeating a group index returns the value once per occurrence.
        assert re.match("(a)", string).group(1, 1) == ("a", "a")
    for string in (b"a", B(b"a"), bytearray(b"a"), memoryview(b"a")):
        assert re.match(b"a", string).groups() == ()
        assert re.match(b"(a)", string).groups() == (b"a",)
        assert re.match(b"(a)", string).group(0) == b"a"
        assert re.match(b"(a)", string).group(1) == b"a"
        assert re.match(b"(a)", string).group(1, 1) == (b"a", b"a")
    # 1-, 2-, and 4-byte UTF-8 code points.
    for a in ("\xe0", "\u0430", "\U0001d49c"):
        assert re.match(a, a).groups() == ()
        assert re.match("(%s)" % a, a).groups() == (a,)
        assert re.match("(%s)" % a, a).group(0) == a
        assert re.match("(%s)" % a, a).group(1) == a
        assert re.match("(%s)" % a, a).group(1, 1) == (a, a)

    # Alternation: only the branch that matched captures.
    pat = re.compile("((a)|(b))(c)?")
    assert pat.match("a").groups() == ("a", "a", None, None)
    assert pat.match("b").groups() == ("b", None, "b", None)
    assert pat.match("ac").groups() == ("a", "a", None, "c")
    assert pat.match("bc").groups() == ("b", None, "b", "c")
    # groups(default) substitutes the default for unmatched groups.
    assert pat.match("bc").groups("") == ("b", "", "b", "c")

    # Named groups are addressable by number or by name, interchangeably.
    pat = re.compile("(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?")
    assert pat.match("a").group(1, 2, 3) == ("a", None, None)
    assert pat.match("b").group("a1", "b2", "c3") == (None, "b", None)
    assert pat.match("ac").group(1, "b2", 3) == ("a", None, "c")
+
+
def test_group():
    """group() accepts ints, __index__ objects, and rejects invalid indices."""

    class Index:
        # Minimal wrapper exposing only __index__, to verify that group()
        # goes through operator.index() for its argument.
        def __init__(self, value):
            self.value = value

        def __index__(self):
            return self.value

    # A single group
    m = re.match("(a)(b)", "ab")
    assert m.group() == "ab"
    assert m.group(0) == "ab"
    assert m.group(1) == "a"
    assert m.group(Index(1)) == "a"
    # Out-of-range indices — including a huge int — raise IndexError.
    assert_raises(IndexError, m.group, -1)
    assert_raises(IndexError, m.group, 3)
    assert_raises(IndexError, m.group, 1 << 1000)

    # Unclear why the below fails
    # assert_raises(IndexError, m.group, Index(1 << 1000))

    assert_raises(IndexError, m.group, "x")
    # Multiple groups
    assert m.group(2, 1) == ("b", "a")
    assert m.group(Index(2), Index(1)) == ("b", "a")
+
+
def test_match_getitem():
    """Match objects support __getitem__ by group name and by group number."""
    pattern = re.compile("(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?")

    found = pattern.match("a")
    assert found["a1"] == "a"
    assert found["b2"] is None
    assert found["c3"] is None
    # __getitem__ also makes a Match usable with str.format_map.
    assert "a1={a1} b2={b2} c3={c3}".format_map(found) == "a1=a b2=None c3=None"
    assert found[0] == "a"
    assert found[1] == "a"
    assert found[2] is None
    assert found[3] is None
    # Unknown names, out-of-range numbers, and tuple keys all raise IndexError.
    with pytest.raises(IndexError):
        found["X"]
    with pytest.raises(IndexError):
        found[-1]
    with pytest.raises(IndexError):
        found[4]
    with pytest.raises(IndexError):
        found[0, 1]
    with pytest.raises(IndexError):
        found[(0,)]
    with pytest.raises(IndexError):
        found[(0, 1)]
    with pytest.raises(IndexError):
        "a1={a2}".format_map(found)

    found = pattern.match("ac")
    assert found["a1"] == "a"
    assert found["b2"] is None
    assert found["c3"] == "c"
    assert "a1={a1} b2={b2} c3={c3}".format_map(found) == "a1=a b2=None c3=c"
    assert found[0] == "ac"
    assert found[1] == "a"
    assert found[2] is None
    assert found[3] == "c"

    # Matches are read-only: item assignment is rejected.
    with pytest.raises(TypeError):
        found[0] = 1

    # Matches do not define __len__.
    assert_raises(TypeError, len, found)
+
+
def test_re_fullmatch():
    """re.fullmatch() anchors the pattern at both ends (issue 16203)."""
    assert re.fullmatch(r"a", "a").span() == (0, 1)
    # str subjects, including str subclasses.
    for string in "ab", S("ab"):
        assert re.fullmatch(r"a|ab", string).span() == (0, 2)
    # bytes-like subjects.
    for string in (b"ab", B(b"ab"), bytearray(b"ab"), memoryview(b"ab")):
        assert re.fullmatch(rb"a|ab", string).span() == (0, 2)
    # Non-ASCII pairs: Latin-1, BMP, and astral code points.
    for first, second in "\xe0\xdf", "\u0430\u0431", "\U0001d49c\U0001d49e":
        alternation = r"%s|%s" % (first, first + second)
        assert re.fullmatch(alternation, first + second).span() == (0, 2)
    # Lazy quantifiers are still forced to consume the whole subject.
    assert re.fullmatch(r".*?$", "abc").span() == (0, 3)
    assert re.fullmatch(r".*?", "abc").span() == (0, 3)
    assert re.fullmatch(r"a.*?b", "ab").span() == (0, 2)
    assert re.fullmatch(r"a.*?b", "abb").span() == (0, 3)
    assert re.fullmatch(r"a.*?b", "axxb").span() == (0, 4)
    # A partial match is not a full match.
    assert re.fullmatch(r"a+", "ab") is None
    assert re.fullmatch(r"abc$", "abc\n") is None
    assert re.fullmatch(r"abc\z", "abc\n") is None
    assert re.fullmatch(r"abc\Z", "abc\n") is None
    assert re.fullmatch(r"(?m)abc$", "abc\n") is None
    # Lookaround does not defeat full-match anchoring.
    assert re.fullmatch(r"ab(?=c)cd", "abcd").span() == (0, 4)
    assert re.fullmatch(r"ab(?<=b)cd", "abcd").span() == (0, 4)
    assert re.fullmatch(r"(?=a|ab)ab", "ab").span() == (0, 2)

    # pos/endpos restrict the region that must be fully matched.
    assert re.compile(r"bc").fullmatch("abcd", pos=1, endpos=3).span() == (1, 3)
    assert re.compile(r".*?$").fullmatch("abcd", pos=1, endpos=3).span() == (1, 3)
    assert re.compile(r".*?").fullmatch("abcd", pos=1, endpos=3).span() == (1, 3)
+
+
def test_re_groupref_exists():
    """Conditional patterns (?(group)yes|no) keyed on whether a group matched."""
    assert re.match(r"^(\()?([^()]+)(?(1)\))$", "(a)").groups() == ("(", "a")
    assert re.match(r"^(\()?([^()]+)(?(1)\))$", "a").groups() == (None, "a")
    assert re.match(r"^(\()?([^()]+)(?(1)\))$", "a)") is None
    assert re.match(r"^(\()?([^()]+)(?(1)\))$", "(a") is None
    assert re.match(r"^(?:(a)|c)((?(1)b|d))$", "ab").groups() == ("a", "b")
    assert re.match(r"^(?:(a)|c)((?(1)b|d))$", "cd").groups() == (None, "d")
    # An empty "yes" branch is allowed.
    assert re.match(r"^(?:(a)|c)((?(1)|d))$", "cd").groups() == (None, "d")
    assert re.match(r"^(?:(a)|c)((?(1)|d))$", "a").groups() == ("a", "")

    # Tests for bug #1177831: exercise groups other than the first group
    pattern = re.compile("(?P<g1>a)(?P<g2>b)?((?(g2)c|d))")
    assert pattern.match("abc").groups() == ("a", "b", "c")
    assert pattern.match("ad").groups() == ("a", None, "d")
    assert pattern.match("abd") is None
    assert pattern.match("ac") is None

    # Support > 100 groups.
    alternatives = "|".join("x(?P<a%d>%x)y" % (i, i) for i in range(1, 200 + 1))
    big_pattern = "(?:%s)(?(200)z)" % alternatives
    assert re.match(big_pattern, "xc8yz").span() == (0, 5)
+
+
def test_re_groupref_exists_errors():
    """Malformed conditional-group constructs must fail to compile."""
    bad_patterns = (
        r"(?P<a>)(?(0)a|b)",           # group 0 cannot be tested
        r"()(?(+1)a|b)",               # signed group number
        r"()" * 10 + r"(?(1_0)a|b)",   # underscore in group number
        r"()(?( 1 )a|b)",              # whitespace around group number
        r"()(?(㊀)a|b)",               # non-ASCII "digits" are rejected
        r"()(?(¹)a|b)",
        r"()(?(१)a|b)",
        r"()(?(1",                     # truncated constructs
        r"()(?(1)a",
        r"()(?(1)a|b",
        r"()(?(1)a|b|c",               # too many branches, unterminated
        r"()(?(1)a|b|c)",              # too many branches
        r"()(?(2)a)",                  # reference to a nonexistent group
    )
    for bad in bad_patterns:
        check_pattern_error(bad)
+
+
def test_re_groupref_exists_validation_bug():
    """Every \\xNN escape inside a conditional group must compile cleanly."""
    for code in range(256):
        re.compile(r"()(?(1)\x%02x?)" % code)
+
+
def test_re_groupref():
    """Backreferences (\\1) to earlier capture groups."""
    assert re.match(r"^(\|)?([^()]+)\1$", "|a|").groups() == ("|", "a")
    assert re.match(r"^(\|)?([^()]+)\1?$", "a").groups() == (None, "a")
    assert re.match(r"^(\|)?([^()]+)\1$", "a|") is None
    assert re.match(r"^(\|)?([^()]+)\1$", "|a") is None
    # A backreference may itself be captured.
    assert re.match(r"^(?:(a)|c)(\1)$", "aa").groups() == ("a", "a")
    assert re.match(r"^(?:(a)|c)(\1)?$", "c").groups() == (None, None)
+
+
def test_groupdict():
    """groupdict() maps every named group to its matched text."""
    found = re.match("(?P<first>first) (?P<second>second)", "first second")
    assert found.groupdict() == {"first": "first", "second": "second"}
+
+
def test_expand():
    """Match.expand() substitutes numeric and named backreferences."""
    found = re.match("(?P<first>first) (?P<second>second)", "first second")
    assert found.expand(r"\2 \1 \g<second> \g<first>") == "second first second first"
    # Groups that did not participate expand to the empty string.
    assert re.match("(?P<first>first)|(?P<second>second)", "first").expand(r"\2 \g<second>") == " "
+
+
def test_repeat_minmax():
    """Bounded repetition {m}, {m,n} and the lazy variants."""
    # Upper bound too small to cover the whole subject.
    assert re.match(r"^(\w){1}$", "abc") is None
    assert re.match(r"^(\w){1}?$", "abc") is None
    assert re.match(r"^(\w){1,2}$", "abc") is None
    assert re.match(r"^(\w){1,2}?$", "abc") is None

    # A repeated capture group holds the text of its final iteration.
    assert re.match(r"^(\w){3}$", "abc").group(1) == "c"
    assert re.match(r"^(\w){1,3}$", "abc").group(1) == "c"
    assert re.match(r"^(\w){1,4}$", "abc").group(1) == "c"
    assert re.match(r"^(\w){3,4}?$", "abc").group(1) == "c"
    assert re.match(r"^(\w){3}?$", "abc").group(1) == "c"
    assert re.match(r"^(\w){1,3}?$", "abc").group(1) == "c"
    assert re.match(r"^(\w){1,4}?$", "abc").group(1) == "c"
    assert re.match(r"^(\w){3,4}?$", "abc").group(1) == "c"

    # Same bounds applied to a plain literal.
    assert re.match(r"^x{1}$", "xxx") is None
    assert re.match(r"^x{1}?$", "xxx") is None
    assert re.match(r"^x{1,2}$", "xxx") is None
    assert re.match(r"^x{1,2}?$", "xxx") is None

    assert re.match(r"^x{3}$", "xxx")
    assert re.match(r"^x{1,3}$", "xxx")
    assert re.match(r"^x{3,3}$", "xxx")
    assert re.match(r"^x{1,4}$", "xxx")
    assert re.match(r"^x{3,4}?$", "xxx")
    assert re.match(r"^x{3}?$", "xxx")
    assert re.match(r"^x{1,3}?$", "xxx")
    assert re.match(r"^x{1,4}?$", "xxx")
    assert re.match(r"^x{3,4}?$", "xxx")

    # Empty braces are not a quantifier; "{}" matches literally.
    assert re.match(r"^x{}$", "xxx") is None
    assert re.match(r"^x{}$", "x{}")

    # min > max is a compile-time error.
    check_pattern_error(r"x{2,1}")
+
+
def test_getattr():
    """Pattern and Match objects expose their metadata as attributes."""
    assert re.compile("(?i)(a)(b)").pattern == "(?i)(a)(b)"
    # assert re.compile("(?i)(a)(b)").flags == re.I | re.U  # TODO: Look into why not
    assert re.compile("(?i)(a)(b)").groups == 2
    assert re.compile("(?i)(a)(b)").groupindex == {}
    assert re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex == {"first": 1, "other": 2}

    # Match attributes.
    assert re.match("(a)", "a").pos == 0
    assert re.match("(a)", "a").endpos == 1
    assert re.match("(a)", "a").string == "a"
    assert re.match("(a)", "a").re

    # Issue 14260. groupindex should be non-modifiable mapping.
    compiled = re.compile(r"(?i)(?P<first>a)(?P<other>b)")
    assert sorted(compiled.groupindex) == ["first", "other"]
    assert compiled.groupindex["other"] == 2

    with pytest.raises(TypeError):
        compiled.groupindex["other"] = 0

    # The failed assignment must not have changed anything.
    assert compiled.groupindex["other"] == 2
+
+
def test_special_escapes():
    """Zero-width escapes \\b, \\B, \\A, \\z/\\Z and the class escapes."""
    assert re.search(r"\b(b.)\b", "abcd abc bcd bx").group(1) == "bx"
    assert re.search(r"\B(b.)\B", "abc bcd bc abxd").group(1) == "bx"

    # TODO: Add ASCII
    assert re.search(r"\b(b.)\b", "abcd abc bcd bx", re.ASCII).group(1) == "bx"
    assert re.search(r"\B(b.)\B", "abc bcd bc abxd", re.ASCII).group(1) == "bx"

    # String anchors interact with MULTILINE: \A and \z/\Z stay absolute.
    assert re.search(r"^abc$", "\nabc\n", re.M).group(0) == "abc"
    assert re.search(r"^\Aabc\z$", "abc", re.M).group(0) == "abc"
    assert re.search(r"^\Aabc\z$", "\nabc\n", re.M) is None
    assert re.search(r"^\Aabc\Z$", "abc", re.M).group(0) == "abc"
    assert re.search(r"^\Aabc\Z$", "\nabc\n", re.M) is None
    # Same behaviour for bytes patterns.
    assert re.search(rb"\b(b.)\b", b"abcd abc bcd bx").group(1) == b"bx"
    assert re.search(rb"\B(b.)\B", b"abc bcd bc abxd").group(1) == b"bx"
    assert re.search(rb"^abc$", b"\nabc\n", re.M).group(0) == b"abc"
    assert re.search(rb"^\Aabc\z$", b"abc", re.M).group(0) == b"abc"
    assert re.search(rb"^\Aabc\z$", b"\nabc\n", re.M) is None
    assert re.search(rb"^\Aabc\Z$", b"abc", re.M).group(0) == b"abc"
    assert re.search(rb"^\Aabc\Z$", b"\nabc\n", re.M) is None
    # Class escapes and their negations.
    assert re.search(r"\d\D\w\W\s\S", "1aa! a").group(0) == "1aa! a"
    assert re.search(rb"\d\D\w\W\s\S", b"1aa! a").group(0) == b"1aa! a"
    assert re.search(r"\d\D\w\W\s\S", "1aa! a", re.ASCII).group(0) == "1aa! a"
+
+
def test_other_escapes():
    """Escaped metacharacters, and rejection of unknown letter escapes."""
    # A lone trailing backslash is an error.
    check_pattern_error("\\")

    assert re.match(r"\(", "(").group() == "("
    assert re.match(r"\(", ")") is None
    assert re.match(r"\\", "\\").group() == "\\"
    assert re.match(r"[\]]", "]").group() == "]"
    assert re.match(r"[\]]", "[") is None
    assert re.match(r"[a\-c]", "-").group() == "-"
    assert re.match(r"[a\-c]", "b") is None
    assert re.match(r"[\^a]+", "a^").group() == "a^"
    assert re.match(r"[\^a]+", "b") is None

    # Unknown ASCII-letter escapes are rejected rather than treated literally.
    for letter in "cijlmopqyCFIJLMOPTY":
        check_pattern_error("\\%c" % letter)
    for letter in "cijlmopqyzABCFIJLMOPTYZ":
        check_pattern_error("[\\%c]" % letter)
+
+
def test_word_boundaries():
    """\\b and \\B semantics across str, bytes, and re.ASCII subjects."""
    # See http://bugs.python.org/issue10713
    assert re.search(r"\b(abc)\b", "abc").group(1) == "abc"
    assert re.search(r"\b(abc)\b", "abc", re.ASCII).group(1) == "abc"
    assert re.search(rb"\b(abc)\b", b"abc").group(1) == b"abc"
    assert re.search(r"\b(ьюя)\b", "ьюя").group(1) == "ьюя"
    assert re.search(r"\b(ьюя)\b", "ьюя", re.ASCII) is None
    # There's a word boundary between a word and a non-word.
    assert re.match(r".\b", "a=")
    assert re.match(r".\b", "a=", re.ASCII)
    assert re.match(rb".\b", b"a=")
    assert re.match(r".\b", "я=")
    assert re.match(r".\b", "я=", re.ASCII) is None
    # There's a word boundary between a non-word and a word.
    assert re.match(r".\b", "=a")
    assert re.match(r".\b", "=a", re.ASCII)
    assert re.match(rb".\b", b"=a")
    assert re.match(r".\b", "=я")
    assert re.match(r".\b", "=я", re.ASCII) is None
    # There is no word boundary inside a word.
    assert re.match(r".\b", "ab") is None
    assert re.match(r".\b", "ab", re.ASCII) is None
    assert re.match(rb".\b", b"ab") is None
    assert re.match(r".\b", "юя") is None
    assert re.match(r".\b", "юя", re.ASCII) is None
    # There is no word boundary between a non-word characters.
    assert re.match(r".\b", "=-") is None
    assert re.match(r".\b", "=-", re.ASCII) is None
    assert re.match(rb".\b", b"=-") is None
    # There is no non-boundary match between a word and a non-word.
    assert re.match(r".\B", "a=") is None
    assert re.match(r".\B", "a=", re.ASCII) is None
    assert re.match(rb".\B", b"a=") is None
    assert re.match(r".\B", "я=") is None
    assert re.match(r".\B", "я=", re.ASCII)
    # There is no non-boundary match between a non-word and a word.
    assert re.match(r".\B", "=a") is None
    assert re.match(r".\B", "=a", re.ASCII) is None
    assert re.match(rb".\B", b"=a") is None
    assert re.match(r".\B", "=я") is None
    assert re.match(r".\B", "=я", re.ASCII)
    # There's a non-boundary match inside a word.
    assert re.match(r".\B", "ab")
    assert re.match(r".\B", "ab", re.ASCII)
    assert re.match(rb".\B", b"ab")
    assert re.match(r".\B", "юя")
    assert re.match(r".\B", "юя", re.ASCII)
    # There's a non-boundary match between a non-word characters.
    assert re.match(r".\B", "=-")
    assert re.match(r".\B", "=-", re.ASCII)
    assert re.match(rb".\B", b"=-")
    # There's a word boundary at the start of a string.
    assert re.match(r"\b", "abc")
    assert re.match(r"\b", "abc", re.ASCII)
    assert re.match(rb"\b", b"abc")
    assert re.match(r"\b", "ьюя")
    assert re.match(r"\b", "ьюя", re.ASCII) is None
    # There's a word boundary at the end of a string.
    assert re.fullmatch(r".+\b", "abc")
    assert re.fullmatch(r".+\b", "abc", re.ASCII)
    assert re.fullmatch(rb".+\b", b"abc")
    assert re.fullmatch(r".+\b", "ьюя")
    assert re.search(r"\b", "ьюя", re.ASCII) is None
    # A non-empty string includes a non-boundary zero-length match.
    assert re.search(r"\B", "abc").span() == (1, 1)
    assert re.search(r"\B", "abc", re.ASCII).span() == (1, 1)
    assert re.search(rb"\B", b"abc").span() == (1, 1)
    assert re.search(r"\B", "ьюя").span() == (1, 1)
    assert re.search(r"\B", "ьюя", re.ASCII).span() == (0, 0)
    # There is no non-boundary match at the start of a string.
    assert re.match(r"\B", "abc") is None
    assert re.match(r"\B", "abc", re.ASCII) is None
    assert re.match(rb"\B", b"abc") is None
    assert re.match(r"\B", "ьюя") is None
    assert re.match(r"\B", "ьюя", re.ASCII)
    # There is no non-boundary match at the end of a string.
    assert re.fullmatch(r".+\B", "abc") is None
    assert re.fullmatch(r".+\B", "abc", re.ASCII) is None
    assert re.fullmatch(rb".+\B", b"abc") is None
    assert re.fullmatch(r".+\B", "ьюя") is None
    assert re.fullmatch(r".+\B", "ьюя", re.ASCII)
    # However, an empty string contains no word boundaries.
    assert re.search(r"\b", "") is None
    assert re.search(r"\b", "", re.ASCII) is None
    assert re.search(rb"\b", b"") is None
    assert re.search(r"\B", "")
    assert re.search(r"\B", "", re.ASCII)
    assert re.search(rb"\B", b"")
    # A single word-character string has two boundaries, but no
    # non-boundary gaps.
    assert len(re.findall(r"\b", "a")) == 2
    assert len(re.findall(r"\b", "a", re.ASCII)) == 2
    assert len(re.findall(rb"\b", b"a")) == 2
    assert len(re.findall(r"\B", "a")) == 0
    assert len(re.findall(r"\B", "a", re.ASCII)) == 0
    assert len(re.findall(rb"\B", b"a")) == 0
    # If there are no words, there are no boundaries
    assert len(re.findall(r"\b", " ")) == 0
    assert len(re.findall(r"\b", " ", re.ASCII)) == 0
    assert len(re.findall(rb"\b", b" ")) == 0
    assert len(re.findall(r"\b", "   ")) == 0
    assert len(re.findall(r"\b", "   ", re.ASCII)) == 0
    assert len(re.findall(rb"\b", b"   ")) == 0
    # Can match around the whitespace.
    assert len(re.findall(r"\B", " ")) == 2
    assert len(re.findall(r"\B", " ", re.ASCII)) == 2
    assert len(re.findall(rb"\B", b" ")) == 2
+
+
def test_bigcharset():
    """Character classes containing non-Latin-1 code points."""
    assert re.match("([\u2222\u2223])", "\u2222").group(1) == "\u2222"
+
+
def test_big_codesize():
    # Issue #1160: very large compiled patterns must still match correctly.
    big = re.compile("|".join("%d" % n for n in range(5000)))
    assert big.match("1000")
    assert big.match("9999")
+
+
def test_anyall():
    """re.DOTALL makes '.' match newline characters."""
    assert re.match("a.b", "a\nb", re.DOTALL).group(0) == "a\nb"
    assert re.match("a.*b", "a\n\nb", re.DOTALL).group(0) == "a\n\nb"
+
+
def test_lookahead():
    """Positive (?=...) and negative (?!...) lookahead assertions."""
    assert re.match(r"(a(?=\s[^a]))", "a b").group(1) == "a"
    assert re.match(r"(a(?=\s[^a]*))", "a b").group(1) == "a"
    assert re.match(r"(a(?=\s[abc]))", "a b").group(1) == "a"
    assert re.match(r"(a(?=\s[abc]*))", "a bc").group(1) == "a"
    assert re.match(r"(a)(?=\s\1)", "a a").group(1) == "a"
    assert re.match(r"(a)(?=\s\1*)", "a aa").group(1) == "a"
    assert re.match(r"(a)(?=\s(abc|a))", "a a").group(1) == "a"

    assert re.match(r"(a(?!\s[^a]))", "a a").group(1) == "a"
    assert re.match(r"(a(?!\s[abc]))", "a d").group(1) == "a"
    assert re.match(r"(a)(?!\s\1)", "a b").group(1) == "a"
    assert re.match(r"(a)(?!\s(abc|a))", "a b").group(1) == "a"

    # Group reference.
    assert re.match(r"(a)b(?=\1)a", "aba")
    assert re.match(r"(a)b(?=\1)c", "abac") is None
    # Conditional group reference.
    assert re.match(r"(?:(a)|(x))b(?=(?(2)x|c))c", "abc")
    assert re.match(r"(?:(a)|(x))b(?=(?(2)c|x))c", "abc") is None
    assert re.match(r"(?:(a)|(x))b(?=(?(2)x|c))c", "abc")
    assert re.match(r"(?:(a)|(x))b(?=(?(1)b|x))c", "abc") is None
    assert re.match(r"(?:(a)|(x))b(?=(?(1)c|x))c", "abc")
    # Group used before defined.
    assert re.match(r"(a)b(?=(?(2)x|c))(c)", "abc")
    assert re.match(r"(a)b(?=(?(2)b|x))(c)", "abc") is None
    assert re.match(r"(a)b(?=(?(1)c|x))(c)", "abc")
+
+
def test_lookbehind():
    """Positive (?<=...) and negative (?<!...) lookbehind assertions."""
    assert re.match(r"ab(?<=b)c", "abc")
    assert re.match(r"ab(?<=c)c", "abc") is None
    assert re.match(r"ab(?<!b)c", "abc") is None
    assert re.match(r"ab(?<!c)c", "abc")
    # Group reference.
    assert re.match(r"(a)a(?<=\1)c", "aac")
    assert re.match(r"(a)b(?<=\1)a", "abaa") is None
    assert re.match(r"(a)a(?<!\1)c", "aac") is None
    assert re.match(r"(a)b(?<!\1)a", "abaa")
    # Conditional group reference.
    assert re.match(r"(?:(a)|(x))b(?<=(?(2)x|c))c", "abc") is None
    assert re.match(r"(?:(a)|(x))b(?<=(?(2)b|x))c", "abc") is None
    assert re.match(r"(?:(a)|(x))b(?<=(?(2)x|b))c", "abc")
    assert re.match(r"(?:(a)|(x))b(?<=(?(1)c|x))c", "abc") is None
    assert re.match(r"(?:(a)|(x))b(?<=(?(1)b|x))c", "abc")
    # Group used before defined.
    assert re.match(r"(a)b(?<=(?(1)c|x))(c)", "abc") is None
    assert re.match(r"(a)b(?<=(?(1)b|x))(c)", "abc")
+
+
def test_ignore_case():
    """re.I folding for literals, classes, backreferences, and odd casings."""
    assert re.match("abc", "ABC", re.I).group(0) == "ABC"
    assert re.match(b"abc", b"ABC", re.I).group(0) == b"ABC"
    assert re.match(r"(a\s[^a])", "a b", re.I).group(1) == "a b"
    assert re.match(r"(a\s[^a]*)", "a bb", re.I).group(1) == "a bb"
    assert re.match(r"(a\s[abc])", "a b", re.I).group(1) == "a b"
    assert re.match(r"(a\s[abc]*)", "a bb", re.I).group(1) == "a bb"
    assert re.match(r"((a)\s\2)", "a a", re.I).group(1) == "a a"
    assert re.match(r"((a)\s\2*)", "a aa", re.I).group(1) == "a aa"
    assert re.match(r"((a)\s(abc|a))", "a a", re.I).group(1) == "a a"
    assert re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1) == "a aa"

    # Two different characters have the same lowercase.
    assert "K".lower() == "\u212a".lower() == "k"  # 'K'
    assert re.match(r"K", "\u212a", re.I)
    assert re.match(r"k", "\u212a", re.I)
    assert re.match(r"\N{U+212a}", "K", re.I)
    assert re.match(r"\N{U+212a}", "k", re.I)

    # Two different characters have the same uppercase.
    assert "s".upper() == "\u017f".upper() == "S"  # 'ſ'
    assert re.match(r"S", "\u017f", re.I)
    assert re.match(r"s", "\u017f", re.I)
    assert re.match(r"\u017f", "S", re.I)
    assert re.match(r"\u017f", "s", re.I)

    # Two different characters have the same uppercase. Unicode 9.0+.
    assert "\u0432".upper() == "\u1c80".upper() == "\u0412"  # 'в', 'ᲀ', 'В'
    assert re.match(r"\u0412", "\u0432", re.I)
    assert re.match(r"\u0412", "\u1c80", re.I)
    assert re.match(r"\u0432", "\u0412", re.I)
    assert re.match(r"\u0432", "\u1c80", re.I)
    assert re.match(r"\u1c80", "\u0412", re.I)
    assert re.match(r"\u1c80", "\u0432", re.I)

    # Two different characters have the same multicharacter uppercase.
    assert "\ufb05".upper() == "\ufb06".upper() == "ST"  # 'ſt', 'st'
    assert re.match(r"\ufb05", "\ufb06", re.I)
    assert re.match(r"\ufb06", "\ufb05", re.I)
+
+
def test_ignore_case_set():
    """re.I folding applied to members of character classes."""
    assert re.match(r"[19A]", "A", re.I)
    assert re.match(r"[19a]", "a", re.I)
    assert re.match(r"[19a]", "A", re.I)
    assert re.match(r"[19A]", "a", re.I)
    assert re.match(rb"[19A]", b"A", re.I)
    assert re.match(rb"[19a]", b"a", re.I)
    assert re.match(rb"[19a]", b"A", re.I)
    assert re.match(rb"[19A]", b"a", re.I)
    # Latin-1 and Cyrillic case pairs.
    assert re.match(r"[19\xc7]", "\xc7", re.I)
    assert re.match(r"[19\xc7]", "\xe7", re.I)
    assert re.match(r"[19\xe7]", "\xc7", re.I)
    assert re.match(r"[19\xe7]", "\xe7", re.I)
    assert re.match(r"[19\u0400]", "\u0400", re.I)
    assert re.match(r"[19\u0400]", "\u0450", re.I)
    assert re.match(r"[19\u0450]", "\u0400", re.I)
    assert re.match(r"[19\u0450]", "\u0450", re.I)

    assert re.match(rb"[19A]", b"A", re.I)
    assert re.match(rb"[19a]", b"a", re.I)
    assert re.match(rb"[19a]", b"A", re.I)
    assert re.match(rb"[19A]", b"a", re.I)

    # Two different characters have the same lowercase.
    assert "K".lower() == "\u212a".lower() == "k"  # 'K'
    assert re.match(r"[19K]", "\u212a", re.I)
    assert re.match(r"[19k]", "\u212a", re.I)
    assert re.match(r"[19\u212a]", "K", re.I)
    assert re.match(r"[19\u212a]", "k", re.I)

    # Two different characters have the same uppercase.
    assert "s".upper() == "\u017f".upper() == "S"  # 'ſ'
    assert re.match(r"[19S]", "\u017f", re.I)
    assert re.match(r"[19s]", "\u017f", re.I)
    assert re.match(r"[19\u017f]", "S", re.I)
    assert re.match(r"[19\u017f]", "s", re.I)

    # Two different characters have the same uppercase. Unicode 9.0+.
    assert "\u0432".upper() == "\u1c80".upper() == "\u0412"  # 'в', 'ᲀ', 'В'
    assert re.match(r"[19\u0412]", "\u0432", re.I)
    assert re.match(r"[19\u0412]", "\u1c80", re.I)
    assert re.match(r"[19\u0432]", "\u0412", re.I)
    assert re.match(r"[19\u0432]", "\u1c80", re.I)
    assert re.match(r"[19\u1c80]", "\u0412", re.I)
    assert re.match(r"[19\u1c80]", "\u0432", re.I)

    # Two different characters have the same multicharacter uppercase.
    assert "\ufb05".upper() == "\ufb06".upper() == "ST"  # 'ſt', 'st'
    assert re.match(r"[19\ufb05]", "\ufb06", re.I)
    assert re.match(r"[19\ufb06]", "\ufb05", re.I)
+
+
def test_ignore_case_range():
    """re.I folding applied to ranges inside character classes."""
    # Issues #3511, #17381.
    assert re.match(r"[9-a]", "_", re.I)
    assert re.match(r"[9-A]", "_", re.I) is None
    assert re.match(rb"[9-a]", b"_", re.I)
    assert re.match(rb"[9-A]", b"_", re.I) is None
    assert re.match(r"[\xc0-\xde]", "\xd7", re.I)
    assert re.match(r"[\xc0-\xde]", "\xe7", re.I)
    assert re.match(r"[\xc0-\xde]", "\xf7", re.I) is None
    assert re.match(r"[\xe0-\xfe]", "\xf7", re.I)
    assert re.match(r"[\xe0-\xfe]", "\xc7", re.I)
    assert re.match(r"[\xe0-\xfe]", "\xd7", re.I) is None
    assert re.match(r"[\u0430-\u045f]", "\u0450", re.I)
    assert re.match(r"[\u0430-\u045f]", "\u0400", re.I)
    assert re.match(r"[\u0400-\u042f]", "\u0450", re.I)
    assert re.match(r"[\u0400-\u042f]", "\u0400", re.I)

    # ASCII mode keeps folding within ASCII-bounded ranges.
    assert re.match(r"[N-\x7f]", "A", re.I | re.A)
    assert re.match(r"[n-\x7f]", "Z", re.I | re.A)
    assert re.match(r"[N-\uffff]", "A", re.I | re.A)
    assert re.match(r"[n-\uffff]", "Z", re.I | re.A)

    # Two different characters have the same lowercase.
    assert "K".lower() == "\u212a".lower() == "k"  # 'K'
    assert re.match(r"[J-M]", "\u212a", re.I)
    assert re.match(r"[j-m]", "\u212a", re.I)
    assert re.match(r"[\u2129-\u212b]", "K", re.I)
    assert re.match(r"[\u2129-\u212b]", "k", re.I)

    # Two different characters have the same uppercase.
    assert "s".upper() == "\u017f".upper() == "S"  # 'ſ'
    assert re.match(r"[R-T]", "\u017f", re.I)
    assert re.match(r"[r-t]", "\u017f", re.I)
    assert re.match(r"[\u017e-\u0180]", "S", re.I)
    assert re.match(r"[\u017e-\u0180]", "s", re.I)

    # Two different characters have the same uppercase. Unicode 9.0+.
    assert "\u0432".upper() == "\u1c80".upper() == "\u0412"  # 'в', 'ᲀ', 'В'
    assert re.match(r"[\u0411-\u0413]", "\u0432", re.I)
    assert re.match(r"[\u0411-\u0413]", "\u1c80", re.I)
    assert re.match(r"[\u0431-\u0433]", "\u0412", re.I)
    assert re.match(r"[\u0431-\u0433]", "\u1c80", re.I)
    assert re.match(r"[\u1c80-\u1c82]", "\u0412", re.I)
    assert re.match(r"[\u1c80-\u1c82]", "\u0432", re.I)

    # Two different characters have the same multicharacter uppercase.
    assert "\ufb05".upper() == "\ufb06".upper() == "ST"  # 'ſt', 'st'
    assert re.match(r"[\ufb04-\ufb05]", "\ufb06", re.I)
    assert re.match(r"[\ufb06-\ufb07]", "\ufb05", re.I)
+
+
def test_category():
    """A category escape (\\s) can be captured."""
    assert re.match(r"(\s)", " ").group(1) == " "
+
+
def test_not_literal():
    """Negated character classes [^...]."""
    assert re.search(r"\s([^a])", " b").group(1) == "b"
    assert re.search(r"\s([^a]*)", " bb").group(1) == "bb"
+
+
def test_possible_set_operations():
    """Set-operation lookalikes (--, &&, ||, ~~) are treated as plain chars."""
    ascii_chars = bytes(range(128)).decode()
    assert re.findall(r"[0-9--1]", ascii_chars) == list("-./0123456789")
    assert re.findall(r"[0-9--2]", ascii_chars) == list("-./0123456789")
    assert re.findall(r"[--1]", ascii_chars) == list("-./01")
    assert re.findall(r"[%--1]", ascii_chars) == list("%&'()*+,-1")
    assert re.findall(r"[%--]", ascii_chars) == list("%&'()*+,-")
    assert re.findall(r"[0-9&&1]", ascii_chars) == list("&0123456789")
    assert re.findall(r"[0-8&&1]", ascii_chars) == list("&012345678")
    assert re.findall(r"[\d&&1]", ascii_chars) == list("&0123456789")
    assert re.findall(r"[&&1]", ascii_chars) == list("&1")
    assert re.findall(r"[0-9||a]", ascii_chars) == list("0123456789a|")
    assert re.findall(r"[\d||a]", ascii_chars) == list("0123456789a|")
    assert re.findall(r"[||1]", ascii_chars) == list("1|")
    assert re.findall(r"[0-9~~1]", ascii_chars) == list("0123456789~")
    assert re.findall(r"[\d~~1]", ascii_chars) == list("0123456789~")
    assert re.findall(r"[~~1]", ascii_chars) == list("1~")
    assert re.findall(r"[[0-9]|]", ascii_chars) == list("0123456789[]")
    assert re.findall(r"[[0-8]|]", ascii_chars) == list("012345678[]")
    assert re.findall(r"[[:digit:]|]", ascii_chars) == list("0123456789|")
+
+
def test_search_coverage():
    """Smoke checks for search() with categories and groups."""
    assert re.search(r"\s(b)", " b").group(1) == "b"
    assert re.search(r"a\s", "a ").group(0) == "a "
+
+
def test_pickling():
    """Compiled patterns survive a pickle round-trip at every protocol."""
    import pickle

    original = re.compile("a(?:b|(c|e){1,2}?|d)+?(.)", re.UNICODE)
    for proto in range(pickle.HIGHEST_PROTOCOL + 1):
        restored = pickle.loads(pickle.dumps(original, proto))
        assert restored.pattern == original.pattern
    # current pickle expects the _compile() reconstructor in re module
    from re import _compile  # noqa: F401
+
+
def test_constants():
    """Short flag aliases equal their long-name counterparts."""
    assert re.I == re.IGNORECASE
    assert re.M == re.MULTILINE
    assert re.S == re.DOTALL
    assert re.X == re.VERBOSE
+
+
def test_flags():
    """Patterns compile under every currently supported top-level flag."""
    for flag in [re.I, re.M, re.X, re.S, re.U]:  # TODO: Add re.A back
        assert re.compile("^pattern$", flag)
    for flag in [re.I, re.M, re.X, re.S]:  # TODO: Add re.A, re.L back
        assert re.compile(b"^pattern$", flag)
+
+
def test_character_set_errors():
    """Malformed character classes must fail to compile."""
    for bad in (
        r"[",
        r"[^",
        r"[a",
        # bug 545855 -- This pattern failed to cause a compile error as it
        # should, instead provoking a TypeError.
        r"[a-",
        r"[\w-b]",  # class escape cannot start a range
        r"[a-\w]",  # class escape cannot end a range
        r"[b-a]",   # reversed range
    ):
        check_pattern_error(bad)
+
+
def test_bug_113254():
    """Groups that did not participate report start/end/span of -1."""
    found = re.match(r"(a)|(b)", "b")
    assert found.start(1) == -1
    assert found.end(1) == -1
    assert found.span(1) == (-1, -1)
+
+
def test_bug_527371():
    # bug described in patches 527371/672491: lastindex/lastgroup tracking
    assert re.match(r"(a)?a", "a").lastindex is None
    assert re.match(r"(a)(b)?b", "ab").lastindex == 1
    assert re.match(r"(?P<a>a)(?P<b>b)?b", "ab").lastgroup == "a"
    # An unnamed inner group does not become lastgroup.
    assert re.match(r"(?P<a>a(b))", "ab").lastgroup == "a"
    assert re.match(r"((a))", "a").lastindex == 1
+
+
def test_bug_418626():
    # bugs 418626 at al. -- Testing Greg Chapman's addition of op code
    # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
    # pattern '*?' on a long string.
    assert re.match(".*?c", 10000 * "ab" + "cd").end(0) == 20001
    assert re.match(".*?cd", 5000 * "ab" + "c" + 5000 * "ab" + "cde").end(0) == 20003
    assert re.match(".*?cd", 20000 * "abc" + "de").end(0) == 60001
    # non-simple '*?' still used to hit the recursion limit, before the
    # non-recursive scheme was implemented.
    assert re.search("(a|b)*?c", 10000 * "ab" + "cd", jit=False).end(0) == 20001
+
+
def test_stack_overflow():
    # Nasty cases that used to overflow the straightforward recursive
    # implementation of repeated groups.
    long_run = 50000 * "x"
    assert re.match("(x)*", long_run).group(1) == "x"
    assert re.match("(x)*y", long_run + "y").group(1) == "x"
    assert re.match("(x)*?y", long_run + "y").group(1) == "x"
+
+
def test_nothing_to_repeat():
    """A quantifier with no preceding atom is a compile error."""
    for reps in "*", "+", "?", "{1,2}":
        for mod in "", "?":
            check_pattern_error("%s%s" % (reps, mod))
            check_pattern_error("(?:%s%s)" % (reps, mod))
+
+
def test_multiple_repeat():
    """Directly stacked quantifiers (e.g. x**) are rejected."""
    for outer_reps in "*", "+", "?", "{1,2}":
        for outer_mod in "", "?", "+":
            outer_op = outer_reps + outer_mod
            for inner_reps in "*", "+", "?", "{1,2}":
                for inner_mod in "", "?", "+":
                    # Skip combinations that parse as one lazy/possessive
                    # quantifier rather than a double repeat.
                    if inner_mod + outer_reps in ("?", "+"):
                        continue
                    inner_op = inner_reps + inner_mod
                    check_pattern_error(r"x%s%s" % (inner_op, outer_op))
+
+
def test_unlimited_zero_width_repeat():
    # Issue #9669: unbounded repetition of a zero-width match must terminate.
    for quantifier in ("*", "+", "{2,}", "*?", "+?", "{2,}?"):
        assert re.match(r"(?:a?)%sy" % quantifier, "z") is None
+
+
def test_bug_448951():
    # bug 448951 (similar to 429357, but with single char match)
    # (Also test greedy matches.)
    for quant in "", "?", "*":
        assert re.match(r"((.%s):)?z" % quant, "z").groups() == (None, None)
        assert re.match(r"((.%s):)?z" % quant, "a:z").groups() == ("a:", "a")
+
+
def test_bug_725106():
    # Capturing groups in alternatives in repeats keep their last values.
    assert re.match("^((a)|b)*", "abc").groups() == ("b", "a")
    assert re.match("^(([ab])|c)*", "abc").groups() == ("c", "b")
    assert re.match("^((d)|[ab])*", "abc").groups() == ("b", None)
    assert re.match("^((a)c|[ab])*", "abc").groups() == ("b", None)
    # Same, with lazy repetition.
    assert re.match("^((a)|b)*?c", "abc").groups() == ("b", "a")
    assert re.match("^(([ab])|c)*?d", "abcd").groups() == ("c", "b")
    assert re.match("^((d)|[ab])*?c", "abc").groups() == ("b", None)
    assert re.match("^((a)c|[ab])*?c", "abc").groups() == ("b", None)
+
+
def test_bug_725149():
    # mark_stack_base restoring before restoring marks
    assert re.match("(a)(?:(?=(b)*)c)*", "abb").groups() == ("a", None)
    assert re.match("(a)((?!(b)*))*", "abb").groups() == ("a", None, None)
+
+
def test_bug_764548():
    # bug 764548: re.compile() used to barf on str subclasses.
    class MyUnicode(str):
        pass

    pattern = re.compile(MyUnicode("abc"))
    assert pattern.match("xyz") is None
+
+
def test_finditer():
    """finditer() with pos/endpos passed positionally and by keyword."""
    matches = re.finditer(r":+", "a:b::c:::d")
    assert [found.group(0) for found in matches] == [":", "::", ":::"]

    pattern = re.compile(r":+")
    matches = pattern.finditer("a:b::c:::d", 1, 10)
    assert [found.group(0) for found in matches] == [":", "::", ":::"]

    matches = pattern.finditer("a:b::c:::d", pos=1, endpos=10)
    assert [found.group(0) for found in matches] == [":", "::", ":::"]

    # Keyword order must not matter.
    matches = pattern.finditer("a:b::c:::d", endpos=10, pos=1)
    assert [found.group(0) for found in matches] == [":", "::", ":::"]

    # A narrower window truncates the matches at its edges.
    matches = pattern.finditer("a:b::c:::d", pos=3, endpos=8)
    assert [found.group(0) for found in matches] == ["::", "::"]
+
+
def test_bug_926075():
    """str and bytes patterns must not share a compile-cache entry."""
    assert re.compile("bug_926075") is not re.compile(b"bug_926075")
+
+
def test_bug_931848():
    """split() on a character class of Unicode full stops."""
    stops = "[\u002e\u3002\uff0e\uff61]"
    assert re.compile(stops).split("a.b.c") == ["a", "b", "c"]
+
+
def test_bug_581080():
    """finditer() yields exactly the matches present, then stops."""
    it = re.finditer(r"\s", "a b")
    assert next(it).span() == (1, 2)
    assert_raises(StopIteration, next, it)
+
+
def test_bug_817234():
    """'.*' yields one full match plus a final empty match at the end."""
    it = re.finditer(r".*", "asdf")
    assert next(it).span() == (0, 4)
    assert next(it).span() == (4, 4)
    assert_raises(StopIteration, next, it)
+
+
def test_bug_6561():
    # '\d' should match characters in Unicode category 'Nd'
    # (Number, Decimal Digit), but not those in 'Nl' (Number,
    # Letter) or 'No' (Number, Other).
    decimal_digits = [
        "\u0037",  # '\N{DIGIT SEVEN}', category 'Nd'
        "\u0e58",  # '\N{THAI DIGIT SIX}', category 'Nd'
        "\uff10",  # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
    ]
    for digit in decimal_digits:
        assert re.match(r"^\d$", digit).group(0) == digit

    not_decimal_digits = [
        "\u2165",  # '\N{ROMAN NUMERAL SIX}', category 'Nl'
        "\u3039",  # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
        "\u2082",  # '\N{SUBSCRIPT TWO}', category 'No'
        "\u32b4",  # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
    ]
    for non_digit in not_decimal_digits:
        assert re.match(r"^\d$", non_digit) is None
+
+
def test_inline_flags():
    # Bug #1700
    upper_char = "\u1ea0"  # Latin Capital Letter A with Dot Below
    lower_char = "\u1ea1"  # Latin Small Letter A with Dot Below

    # Every way of spelling IGNORECASE+DOTALL — external flags, one inline
    # group, a combined inline group, or two inline groups — must let '.'
    # cross the newline and match the opposite-cased character.
    for prefix, flags in (
        ("", re.I | re.S),
        ("(?i)", re.S),
        ("(?is)", 0),
        ("(?s)(?i)", 0),
    ):
        assert re.compile(prefix + "." + upper_char, flags).match("\n" + lower_char)
        assert re.compile(prefix + "." + lower_char, flags).match("\n" + upper_char)

    # VERBOSE combined with inline IGNORECASE, in every spelling.
    assert re.match("(?ix) " + upper_char, lower_char)
    assert re.match("(?ix) " + lower_char, upper_char)
    assert re.match(" (?i) " + upper_char, lower_char, re.X)
    assert re.match("(?x) (?i) " + upper_char, lower_char)
    assert re.match(" (?x) (?i) " + upper_char, lower_char, re.X)
+
+
def test_dollar_matches_twice():
    r"""'$' does not include '\n': it matches at the end of the string and
    just before a terminating '\n' (and per-line under MULTILINE)."""
    plain = re.compile("$")
    assert plain.sub("#", "a\nb\n") == "a\nb#\n#"
    assert plain.sub("#", "a\nb\nc") == "a\nb\nc#"
    assert plain.sub("#", "\n") == "#\n#"

    multiline = re.compile("$", re.MULTILINE)
    assert multiline.sub("#", "a\nb\n") == "a#\nb#\n#"
    assert multiline.sub("#", "a\nb\nc") == "a#\nb#\nc#"
    assert multiline.sub("#", "\n") == "#\n#"
+
+
def test_bytes_str_mixing():
    # Mixing str and bytes is disallowed: a str pattern rejects bytes
    # subjects and replacements, a bytes pattern rejects str ones, and
    # every such mix raises TypeError.
    pat = re.compile(".")
    bpat = re.compile(b".")
    assert_raises(TypeError, pat.match, b"b")
    assert_raises(TypeError, bpat.match, "b")
    assert_raises(TypeError, pat.sub, b"b", "c")
    assert_raises(TypeError, pat.sub, "b", b"c")
    assert_raises(TypeError, pat.sub, b"b", b"c")
    assert_raises(TypeError, bpat.sub, b"b", "c")
    assert_raises(TypeError, bpat.sub, "b", b"c")
    assert_raises(TypeError, bpat.sub, "b", "c")
+
+
def test_ascii_and_unicode_flag():
    # String patterns: Unicode semantics are the default (with or without
    # an explicit re.UNICODE); re.ASCII or inline (?a) restricts \w.
    for flags in (0, re.UNICODE):
        pat = re.compile("\xc0", flags | re.IGNORECASE)
        assert pat.match("\xe0")
        pat = re.compile(r"\w", flags)
        assert pat.match("\xe0")
    pat = re.compile(r"\w", re.ASCII)
    assert pat.match("\xe0") is None
    pat = re.compile(r"(?a)\w")
    assert pat.match("\xe0") is None
    # Bytes patterns: always ASCII semantics, so \w never matches \xe0.
    for flags in (0, re.ASCII):
        pat = re.compile(b"\xc0", flags | re.IGNORECASE)
        assert pat.match(b"\xe0") is None
        pat = re.compile(rb"\w", flags)
        assert pat.match(b"\xe0") is None
    # Incompatibilities: (?u) on a bytes pattern, (?u) combined with
    # re.ASCII, and (?au) together must all be rejected at compile time.
    # NOTE(review): re.PatternError is the 3.13+ alias of re.error —
    # confirm the minimum supported Python version.
    check_pattern_error(rb"(?u)\w")
    assert_raises(re.PatternError, re.compile, r"(?u)\w", re.ASCII)
    check_pattern_error(r"(?au)\w")
+
+
def test_scoped_flags():
    # (?flags:...) / (?-flags:...) apply or remove flags only inside the
    # group; the surrounding pattern keeps its own flag state.
    assert re.match(r"(?i:a)b", "Ab")
    assert re.match(r"(?i:a)b", "aB") is None
    assert re.match(r"(?-i:a)b", "Ab", re.IGNORECASE) is None
    assert re.match(r"(?-i:a)b", "aB", re.IGNORECASE)
    assert re.match(r"(?i:(?-i:a)b)", "Ab") is None
    assert re.match(r"(?i:(?-i:a)b)", "aB")
    assert re.match(r"\w(?a:\W)\w", "\xe0\xe0\xe0")

    # Malformed or contradictory scoped-flag syntax must raise.
    check_pattern_error(rb"(?aL:a)")
    check_pattern_error(r"(?-")
    check_pattern_error(r"(?-+")
    check_pattern_error(r"(?-z")
    check_pattern_error(r"(?-i")
    check_pattern_error(r"(?-i+")
    check_pattern_error(r"(?-iz")
    check_pattern_error(r"(?i:")
    check_pattern_error(r"(?i")
    check_pattern_error(r"(?i+")
    check_pattern_error(r"(?iz")
+
+
def test_ignore_spaces():
    # VERBOSE mode ignores every unescaped whitespace character, for str
    # and bytes patterns alike; (?x)/(?-x) toggle the behavior inline.
    for ws in " \t\n\r\v\f":
        assert re.fullmatch(ws + "a", "a", re.VERBOSE)
        assert re.fullmatch(ws.encode() + b"a", b"a", re.VERBOSE)
    assert re.fullmatch("(?x) a", "a")
    assert re.fullmatch(" (?x) a", "a", re.VERBOSE)
    assert re.fullmatch("(?x) (?x) a", "a")
    assert re.fullmatch(" a(?x: b) c", " ab c")
    assert re.fullmatch(" a(?-x: b) c", "a bc", re.VERBOSE)
    assert re.fullmatch("(?x) a(?-x: b) c", "a bc")
    assert re.fullmatch("(?x) a| b", "a")
    assert re.fullmatch("(?x) a| b", "b")
+
+
def test_comments():
    # '#' starts a to-end-of-line comment in VERBOSE mode; (?x)/(?-x)
    # switch comment handling on and off mid-pattern.
    full = re.fullmatch
    assert full("#x\na", "a", re.VERBOSE)
    assert full(b"#x\na", b"a", re.VERBOSE)
    assert full("(?x)#x\na", "a")
    assert full("#x\n(?x)#y\na", "a", re.VERBOSE)
    assert full("(?x)#x\n(?x)#y\na", "a")
    assert full("#x\na(?x:#y\nb)#z\nc", "#x\nab#z\nc")
    assert full("#x\na(?-x:#y\nb)#z\nc", "a#y\nbc", re.VERBOSE)
    assert full("(?x)#x\na(?-x:#y\nb)#z\nc", "a#y\nbc")
    assert full("(?x)#x\na|#y\nb", "a")
    assert full("(?x)#x\na|#y\nb", "b")
+
+
def test_bug_6509():
    # Bug 6509: replacement templates (and callables) must work the same
    # for str and for bytes.
    # all-str
    assert re.sub(r"a(\w)", "b\\1", "ac") == "bc"
    assert re.sub("a(.)", "b\\1", "a\u1234") == "b\u1234"
    assert re.sub("..", lambda match: "str", "a5") == "str"

    # all-bytes
    assert re.sub(rb"a(\w)", b"b\\1", b"ac") == b"bc"
    assert re.sub(b"a(.)", b"b\\1", b"a\xcd") == b"b\xcd"
    assert re.sub(b"..", lambda match: b"bytes", b"a5") == b"bytes"
+
+
def test_search_dot_unicode():
    # '.' in a str pattern matches non-ASCII code points, including
    # astral-plane ones.
    for middle in ("abc", "\xe9", "\u20ac", "\U0010ffff", "\xe9\u20ac\U0010ffff"):
        assert re.search("123.*-", "123" + middle + "-")
+
+
def test_compile():
    # re.compile returns a Pattern for a str argument, returns the very
    # same object when given an already-compiled Pattern, and raises
    # TypeError for anything else.
    pattern = re.compile("random pattern")
    assert isinstance(pattern, re.Pattern)
    same_pattern = re.compile(pattern)
    assert isinstance(same_pattern, re.Pattern)
    assert same_pattern is pattern
    # Test behaviour when not given a string or pattern as parameter
    assert_raises(TypeError, re.compile, 0)
+
+
def test_large_search():
    # Issue #10182: match indices used to be 32-bit-truncated.
    size = 2  # * 1024 ** 2 — full size works but is slow for routine runs
    haystack = "a" * size
    match = re.search("$", haystack)
    assert match is not None
    assert (match.start(), match.end()) == (size, size)
+
+
def test_large_subn():
    # Issue #10182: subn's substitution count used to be 32-bit-truncated.
    size = 2  # * 1024 ** 2 — full size works but is slow for routine runs
    haystack = "a" * size
    result, count = re.subn("", "", haystack)
    assert result == haystack
    assert count == size + 1
+
+
def test_bug_16688():
    # Issue 16688: backreferences broke case-insensitive matching on
    # non-ASCII strings.
    found = re.findall(r"(?i)(a)\1", "aa \u0100")
    assert found == ["a"]
    span = re.match(r"(?s).{1,3}", "\u0100\u0100").span()
    assert span == (0, 2)
+
+
def test_repeat_minmax_overflow():
    # Issue #13169: a repeat count of 65535 used to overflow the internal
    # representation; exact {n}, {,n} and lazy {n,}? forms all matter.
    subject = "x" * 100000
    for pat in (r".{65535}", r".{,65535}", r".{65535,}?"):
        assert re.match(pat, subject).span() == (0, 65535)
+
+
def test_look_behind_overflow():
    # A look-behind whose total width is (2**22)**3 == 2**66 must be
    # rejected at compile time instead of overflowing internally.
    string = "x" * 2_500_000  # NOTE(review): unused here — presumably leftover from a larger upstream test; confirm
    p1 = r"(?<=((.{%d}){%d}){%d})"
    p2 = r"(?<!((.{%d}){%d}){%d})"
    # But 2**66 is too large for look-behind width.
    assert_raises(re.error, re.compile, p1 % (2**22, 2**22, 2**22))
    assert_raises(re.error, re.compile, p2 % (2**22, 2**22, 2**22))
+
+
def test_issue17998():
    # Issue 17998: every repeat operator, greedy and lazy, must work with
    # DOTALL '.' for both str and bytes patterns.
    for reps in ("*", "+", "?", "{1}"):
        for lazy in ("", "?"):
            pat = "." + reps + lazy + "yz"
            assert re.compile(pat, re.S).findall("xyz") == ["xyz"]
            assert re.compile(pat.encode(), re.S).findall(b"xyz") == [b"xyz"]
+
+
def test_match_repr():
    # repr(Match) must show the module-qualified type name, the span and
    # the matched text, for str subjects, str subclasses, and all
    # bytes-like subjects (bytes subclass, bytearray, memoryview).
    for string in "[abracadabra]", S("[abracadabra]"):
        m = re.search(r"(.+)(.*?)\1", string)
        pattern = r"<(%s\.)?%s object; span=\(1, 12\), match='abracadabra'>" % (
            type(m).__module__,
            type(m).__qualname__,
        )
        assert re.search(pattern, repr(m))
    for string in (
        b"[abracadabra]",
        B(b"[abracadabra]"),
        bytearray(b"[abracadabra]"),
        memoryview(b"[abracadabra]"),
    ):
        m = re.search(rb"(.+)(.*?)\1", string)
        pattern = r"<(%s\.)?%s object; span=\(1, 12\), match=b'abracadabra'>" % (
            type(m).__module__,
            type(m).__qualname__,
        )
        assert re.search(pattern, repr(m))

    # Each Match repr reflects its own span/text, not the last match's.
    first, second = list(re.finditer("(aa)|(bb)", "aa bb"))
    pattern = r"<(%s\.)?%s object; span=\(0, 2\), match='aa'>" % (
        type(second).__module__,
        type(second).__qualname__,
    )
    assert re.search(pattern, repr(first))
    pattern = r"<(%s\.)?%s object; span=\(3, 5\), match='bb'>" % (
        type(second).__module__,
        type(second).__qualname__,
    )
    assert re.search(pattern, repr(second))
+
+
def test_zerowidth():
    # Issues 852532, 1647489, 3262, 25054: zero-width matches take part in
    # split/sub/findall/finditer without skipping or duplicating text.
    assert re.split(r"\b", "a::bc") == ["", "a", "::", "bc", ""]
    assert re.split(r"\b|:+", "a::bc") == ["", "a", "", "", "bc", ""]
    assert re.split(r"(?<!\w)(?=\w)|:+", "a::bc") == ["", "a", "", "bc"]
    assert re.split(r"(?<=\w)(?!\w)|:+", "a::bc") == ["a", "", "bc", ""]

    assert re.sub(r"\b", "-", "a::bc") == "-a-::-bc-"
    assert re.sub(r"\b|:+", "-", "a::bc") == "-a---bc-"
    assert re.sub(r"(\b|:+)", r"[\1]", "a::bc") == "[]a[][::][]bc[]"

    assert re.findall(r"\b|:+", "a::bc") == ["", "", "::", "", ""]
    assert re.findall(r"\b|\w+", "a::bc") == ["", "a", "", "", "bc", ""]

    spans = [m.span() for m in re.finditer(r"\b|:+", "a::bc")]
    assert spans == [(0, 0), (1, 1), (1, 3), (3, 3), (5, 5)]
    spans = [m.span() for m in re.finditer(r"\b|\w+", "a::bc")]
    assert spans == [(0, 0), (0, 1), (1, 1), (3, 3), (3, 5), (5, 5)]
+
+
def test_bug_2537():
    # Issue 2537: empty submatches inside nested repeats must still record
    # the correct group values.
    for outer in ("{0,}", "*", "+", "{1,187}"):
        for inner in ("{0,}", "*", "?"):
            rx = re.compile("^((x|y)%s)%s" % (inner, outer))
            m = rx.match("xyyzy")
            assert (m.group(0), m.group(1), m.group(2)) == ("xyy", "", "y")
+
+
def test_keyword_parameters():
    # Issue #20283: string/pos/endpos/maxsplit are accepted as keywords.
    pat = re.compile(r"(ab)")
    hit = (7, 9)
    assert pat.match(string="abracadabra", pos=7, endpos=10).span() == hit
    assert pat.fullmatch(string="abracadabra", pos=7, endpos=9).span() == hit
    assert pat.search(string="abracadabra", pos=3, endpos=10).span() == hit
    assert pat.findall(string="abracadabra", pos=3, endpos=10) == ["ab"]
    assert pat.split(string="abracadabra", maxsplit=1) == ["", "ab", "racadabra"]
+
+
def test_bug_20998():
    # Issue #20998: fullmatch of a repeated character class with
    # IGNORECASE used to fail.
    m = re.fullmatch("[a-c]+", "ABC", re.I)
    assert m.span() == (0, 3)
+
+
def test_misc_errors():
    # Each malformed pattern (unbalanced parens, truncated groups,
    # unknown flags, unterminated comments/look-behinds) must raise a
    # pattern-compilation error.
    check_pattern_error(r"(")
    check_pattern_error(r"((a|b)")
    check_pattern_error(r"(a|b))")
    check_pattern_error(r"(?P")
    check_pattern_error(r"(?z)")
    check_pattern_error(r"(?iz)")
    check_pattern_error(r"(?i")
    check_pattern_error(r"(?#abc")
    check_pattern_error(r"(?<")
    check_pattern_error(r"(?<>)")
    check_pattern_error(r"(?")
+
+
def test_enum():
    # Issue #28082: str(flag) should be human readable (e.g. contain
    # "IGNORECASE" for re.I) instead of a bare integer.
    # Intentionally a placeholder until the enum representation is ported.
    # TODO: Change representation of enums
    # self.assertIn("IGNORECASE", str(re.I))
    # self.assertIn("DOTALL", str(re.S))
    pass
+
+
+def test_bug_34294():
+ # Issue 34294: wrong capturing groups
+ # exists since Python 2
+ s = "a\tx"
+ p = r"\b(?=(\t)|(x))x"
+ assert re.search(p, s).groups() == (None, "x")
+
+ # introduced in Python 3.7.0
+ s = "ab"
+ p = r"(?=(.)(.)?)"
+ assert re.findall(p, s), [("a", "b") == ("b", "")]
+ assert [m.groups() for m in re.finditer(p, s)], [("a", "b") == ("b", None)]
+
+ # test-cases provided by issue34294, introduced in Python 3.7.0
+ p = r"(?=<(?P<tag>\w+)/?>(?:(?P<text>.+?)</(?P=tag)>)?)"
+ s = "<test><foo2/></test>"
+ assert re.findall(p, s), [("test", "<foo2/>") == ("foo2", "")]
+ assert [m.groupdict() for m in re.finditer(p, s)] == [
+ {"tag": "test", "text": "<foo2/>"},
+ {"tag": "foo2", "text": None},
+ ]
+ s = "<test>Hello</test><foo/>"
+ assert [m.groupdict() for m in re.finditer(p, s)] == [
+ {"tag": "test", "text": "Hello"},
+ {"tag": "foo", "text": None},
+ ]
+ s = "<test>Hello</test><foo/><foo/>"
+ assert [m.groupdict() for m in re.finditer(p, s)] == [
+ {"tag": "test", "text": "Hello"},
+ {"tag": "foo", "text": None},
+ {"tag": "foo", "text": None},
+ ]
+
+
+def test_MARK_PUSH_macro_bug():
+ # issue35859, MARK_PUSH() macro didn't protect MARK-0 if it
+ # was the only available mark.
+ assert re.match(r"(ab|a)*?b", "ab").groups() == ("a",)
+ assert re.match(r"(ab|a)+?b", "ab").groups() == ("a",)
+ assert re.match(r"(ab|a){0,2}?b", "ab").groups() == ("a",)
+ assert re.match(r"(.b|a)*?b", "ab").groups() == ("a",)
+
+
+def test_MIN_UNTIL_mark_bug():
+ # Fixed in issue35859, reported in issue9134.
+ # JUMP_MIN_UNTIL_2 should MARK_PUSH() if in a repeat
+ s = "axxzbcz"
+ p = r"(?:(?:a|bc)*?(xx)??z)*"
+ assert re.match(p, s).groups() == ("xx",)
+
+ # test-case provided by issue9134
+ s = "xtcxyzxc"
+ p = r"((x|yz)+?(t)??c)*"
+ m = re.match(p, s)
+ assert m.span() == (0, 8)
+ assert m.span(2) == (6, 7)
+ assert m.groups() == ("xyzxc", "x", "t")
+
+
+def test_REPEAT_ONE_mark_bug():
+ # issue35859
+ # JUMP_REPEAT_ONE_1 should MARK_PUSH() if in a repeat
+ s = "aabaab"
+ p = r"(?:[^b]*a(?=(b)|(a))ab)*"
+ m = re.match(p, s)
+ assert m.span() == (0, 6)
+ assert m.span(2) == (4, 5)
+ assert m.groups() == (None, "a")
+
+ # JUMP_REPEAT_ONE_2 should MARK_PUSH() if in a repeat
+ s = "abab"
+ p = r"(?:[^b]*(?=(b)|(a))ab)*"
+ m = re.match(p, s)
+ assert m.span() == (0, 4)
+ assert m.span(2) == (2, 3)
+ assert m.groups() == (None, "a")
+
+ assert re.match(r"(ab?)*?b", "ab").groups() == ("a",)
+
+
+def test_MIN_REPEAT_ONE_mark_bug():
+ # issue35859
+ # JUMP_MIN_REPEAT_ONE should MARK_PUSH() if in a repeat
+ s = "abab"
+ p = r"(?:.*?(?=(a)|(b))b)*"
+ m = re.match(p, s)
+ assert m.span() == (0, 4)
+ assert m.span(2) == (3, 4)
+ assert m.groups() == (None, "b")
+
+ s = "axxzaz"
+ p = r"(?:a*?(xx)??z)*"
+ assert re.match(p, s).groups() == ("xx",)
+
+
+def test_ASSERT_NOT_mark_bug():
+ # Fixed in issue35859, reported in issue725149.
+ # JUMP_ASSERT_NOT should LASTMARK_SAVE()
+ assert re.match(r"(?!(..)c)", "ab").groups() == (None,)
+
+ # JUMP_ASSERT_NOT should MARK_PUSH() if in a repeat
+ m = re.match(r"((?!(ab)c)(.))*", "abab")
+ assert m.span() == (0, 4)
+ assert m.span(1) == (3, 4)
+ assert m.span(3) == (3, 4)
+ assert m.groups() == ("b", None, "b")
+
+
def test_bug_40736():
    # bpo-40736: searching a non-string/bytes subject (an int, a type
    # object) must raise TypeError.
    with pytest.raises(TypeError):
        re.search("x*", 5)
    with pytest.raises(TypeError):
        re.search("x*", type)
+
+
def test_search_anchor_at_beginning():
    # Start-anchored patterns over a very large subject: every API must
    # report "no match" without altering the input.
    haystack = "x" * 10**7
    for anchored in (r"\Ay", r"^y"):
        assert re.search(anchored, haystack) is None
        assert re.split(anchored, haystack) == [haystack]
        assert re.findall(anchored, haystack) == []
        assert list(re.finditer(anchored, haystack)) == []
        assert re.sub(anchored, "", haystack) == haystack
+
+
+def test_possessive_quantifiers():
+ """Test Possessive Quantifiers
+ Test quantifiers of the form @+ for some repetition operator @,
+ e.g. x{3,5}+ meaning match from 3 to 5 greadily and proceed
+ without creating a stack frame for rolling the stack back and
+ trying 1 or more fewer matches."""
+ assert re.match("e*+e", "eeee") is None
+ assert re.match("e++a", "eeea").group(0) == "eeea"
+ assert re.match("e?+a", "ea").group(0) == "ea"
+ assert re.match("e{2,4}+a", "eeea").group(0) == "eeea"
+ assert re.match("(.)++.", "ee") is None
+ assert re.match("(ae)*+a", "aea").groups() == ("ae",)
+ assert re.match("([ae][ae])?+a", "aea").groups() == ("ae",)
+ assert re.match("(e?){2,4}+a", "eeea").groups() == ("",)
+ assert re.match("()*+a", "a").groups() == ("",)
+ assert re.search("x*+", "axx").span() == (0, 0)
+ assert re.search("x++", "axx").span() == (1, 3)
+ assert re.match("a*+", "xxx").span() == (0, 0)
+ assert re.match("x*+", "xxxa").span() == (0, 3)
+ assert re.match("a++", "xxx") is None
+ assert re.match(r"^(\w){1}+$", "abc") is None
+ assert re.match(r"^(\w){1,2}+$", "abc") is None
+
+ assert re.match(r"^(\w){3}+$", "abc").group(1) == "c"
+ assert re.match(r"^(\w){1,3}+$", "abc").group(1) == "c"
+ assert re.match(r"^(\w){1,4}+$", "abc").group(1) == "c"
+
+ assert re.match("^x{1}+$", "xxx") is None
+ assert re.match("^x{1,2}+$", "xxx") is None
+
+ assert re.match("^x{3}+$", "xxx")
+ assert re.match("^x{1,3}+$", "xxx")
+ assert re.match("^x{1,4}+$", "xxx")
+
+ assert re.match("^x{}+$", "xxx") is None
+ assert re.match("^x{}+$", "x{}")
+
+
+def test_fullmatch_possessive_quantifiers():
+ assert re.fullmatch(r"a++", "a")
+ assert re.fullmatch(r"a*+", "a")
+ assert re.fullmatch(r"a?+", "a")
+ assert re.fullmatch(r"a{1,3}+", "a")
+ assert re.fullmatch(r"a++", "ab") is None
+ assert re.fullmatch(r"a*+", "ab") is None
+ assert re.fullmatch(r"a?+", "ab") is None
+ assert re.fullmatch(r"a{1,3}+", "ab") is None
+ assert re.fullmatch(r"a++b", "ab")
+ assert re.fullmatch(r"a*+b", "ab")
+ assert re.fullmatch(r"a?+b", "ab")
+ assert re.fullmatch(r"a{1,3}+b", "ab")
+
+ assert re.fullmatch(r"(?:ab)++", "ab")
+ assert re.fullmatch(r"(?:ab)*+", "ab")
+ assert re.fullmatch(r"(?:ab)?+", "ab")
+ assert re.fullmatch(r"(?:ab){1,3}+", "ab")
+ assert re.fullmatch(r"(?:ab)++", "abc") is None
+ assert re.fullmatch(r"(?:ab)*+", "abc") is None
+ assert re.fullmatch(r"(?:ab)?+", "abc") is None
+ assert re.fullmatch(r"(?:ab){1,3}+", "abc") is None
+ assert re.fullmatch(r"(?:ab)++c", "abc")
+ assert re.fullmatch(r"(?:ab)*+c", "abc")
+ assert re.fullmatch(r"(?:ab)?+c", "abc")
+ assert re.fullmatch(r"(?:ab){1,3}+c", "abc")
+
+
+def test_findall_possessive_quantifiers():
+ assert re.findall(r"a++", "aab") == ["aa"]
+ assert re.findall(r"a*+", "aab") == ["aa", "", ""]
+ assert re.findall(r"a?+", "aab") == ["a", "a", "", ""]
+ assert re.findall(r"a{1,3}+", "aab") == ["aa"]
+
+ assert re.findall(r"(?:ab)++", "ababc") == ["abab"]
+ assert re.findall(r"(?:ab)*+", "ababc") == ["abab", "", ""]
+ assert re.findall(r"(?:ab)?+", "ababc") == ["ab", "ab", "", ""]
+ assert re.findall(r"(?:ab){1,3}+", "ababc") == ["abab"]
+
+
+def test_atomic_grouping():
+ """Test Atomic Grouping
+ Test non-capturing groups of the form (?>...), which does
+ not maintain any stack point created within the group once the
+ group is finished being evaluated."""
+ pattern1 = re.compile(r"a(?>bc|b)c")
+ assert pattern1.match("abc") is None
+ assert pattern1.match("abcc")
+ assert re.match(r"(?>.*).", "abc") is None
+ assert re.match(r"(?>x)++", "xxx")
+ assert re.match(r"(?>x++)", "xxx")
+ assert re.match(r"(?>x)++x", "xxx") is None
+ assert re.match(r"(?>x++)x", "xxx") is None
+
+
+def test_fullmatch_atomic_grouping():
+ assert re.fullmatch(r"(?>a+)", "a")
+ assert re.fullmatch(r"(?>a*)", "a")
+ assert re.fullmatch(r"(?>a?)", "a")
+ assert re.fullmatch(r"(?>a{1,3})", "a")
+ assert re.fullmatch(r"(?>a+)", "ab") is None
+ assert re.fullmatch(r"(?>a*)", "ab") is None
+ assert re.fullmatch(r"(?>a?)", "ab") is None
+ assert re.fullmatch(r"(?>a{1,3})", "ab") is None
+ assert re.fullmatch(r"(?>a+)b", "ab")
+ assert re.fullmatch(r"(?>a*)b", "ab")
+ assert re.fullmatch(r"(?>a?)b", "ab")
+ assert re.fullmatch(r"(?>a{1,3})b", "ab")
+
+ assert re.fullmatch(r"(?>(?:ab)+)", "ab")
+ assert re.fullmatch(r"(?>(?:ab)*)", "ab")
+ assert re.fullmatch(r"(?>(?:ab)?)", "ab")
+ assert re.fullmatch(r"(?>(?:ab){1,3})", "ab")
+ assert re.fullmatch(r"(?>(?:ab)+)", "abc") is None
+ assert re.fullmatch(r"(?>(?:ab)*)", "abc") is None
+ assert re.fullmatch(r"(?>(?:ab)?)", "abc") is None
+ assert re.fullmatch(r"(?>(?:ab){1,3})", "abc") is None
+ assert re.fullmatch(r"(?>(?:ab)+)c", "abc")
+ assert re.fullmatch(r"(?>(?:ab)*)c", "abc")
+ assert re.fullmatch(r"(?>(?:ab)?)c", "abc")
+ assert re.fullmatch(r"(?>(?:ab){1,3})c", "abc")
+
+
+def test_findall_atomic_grouping():
+ assert re.findall(r"(?>a+)", "aab") == ["aa"]
+ assert re.findall(r"(?>a*)", "aab") == ["aa", "", ""]
+ assert re.findall(r"(?>a?)", "aab") == ["a", "a", "", ""]
+ assert re.findall(r"(?>a{1,3})", "aab") == ["aa"]
+
+ assert re.findall(r"(?>(?:ab)+)", "ababc") == ["abab"]
+ assert re.findall(r"(?>(?:ab)*)", "ababc") == ["abab", "", ""]
+ assert re.findall(r"(?>(?:ab)?)", "ababc") == ["ab", "ab", "", ""]
+ assert re.findall(r"(?>(?:ab){1,3})", "ababc") == ["abab"]
+
+
def test_bug_gh91616():
    # gh-91616: reproducer with an atomic group and its lookahead-based
    # equivalent; both must fullmatch "a.txt".
    # NOTE(review): \z (strict end-of-string) is only available in recent
    # Python — confirm against the minimum supported version.
    assert re.fullmatch(r"(?s:(?>.*?\.).*)\z", "a.txt")  # reproducer
    assert re.fullmatch(r"(?s:(?=(?P<g0>.*?\.))(?P=g0).*)\z", "a.txt")
+
+
+def test_bug_gh100061():
+ # gh-100061
+ assert re.match("(?>(?:.(?!D))+)", "ABCDE").span() == (0, 2)
+ assert re.match("(?:.(?!D))++", "ABCDE").span() == (0, 2)
+ assert re.match("(?>(?:.(?!D))*)", "ABCDE").span() == (0, 2)
+ assert re.match("(?:.(?!D))*+", "ABCDE").span() == (0, 2)
+ assert re.match("(?>(?:.(?!D))?)", "CDE").span() == (0, 0)
+ assert re.match("(?:.(?!D))?+", "CDE").span() == (0, 0)
+ assert re.match("(?>(?:.(?!D)){1,3})", "ABCDE").span() == (0, 2)
+ assert re.match("(?:.(?!D)){1,3}+", "ABCDE").span() == (0, 2)
+ # gh-106052
+ assert re.match("(?>(?:ab?c)+)", "aca").span() == (0, 2)
+ assert re.match("(?:ab?c)++", "aca").span() == (0, 2)
+ assert re.match("(?>(?:ab?c)*)", "aca").span() == (0, 2)
+ assert re.match("(?:ab?c)*+", "aca").span() == (0, 2)
+ assert re.match("(?>(?:ab?c)?)", "a").span() == (0, 0)
+ assert re.match("(?:ab?c)?+", "a").span() == (0, 0)
+ assert re.match("(?>(?:ab?c){1,3})", "aca").span() == (0, 2)
+ assert re.match("(?:ab?c){1,3}+", "aca").span() == (0, 2)
+
+
+def test_bug_gh101955():
+ # Possessive quantifier with nested alternative with capture groups
+ assert re.match("((x)|y|z)*+", "xyz").groups() == ("z", "x")
+ assert re.match("((x)|y|z){3}+", "xyz").groups() == ("z", "x")
+ assert re.match("((x)|y|z){3,}+", "xyz").groups() == ("z", "x")
+
+
def test_regression_gh94675():
    # gh-94675: run the substitution in a child process so that a
    # regression shows up as a 30-second timeout failure instead of
    # hanging the whole test run.
    # TODO: Multiprocessing requires pickling
    pattern = re.compile(
        r"(?<=[({}])(((//[^\n]*)?[\n])([\000-\040])*)*"
        r"((/[^/\[\n]*(([^\n]|(\[\n]*(]*)*\]))"
        r"[^/\[]*)*/))((((//[^\n]*)?[\n])"
        r"([\000-\040]|(/\*[^*]*\*+"
        r"([^/*]\*+)*/))*)+(?=[^\000-\040);\]}]))"
    )
    input_js = """a(function() {
///////////////////////////////////////////////////////////////////
});"""
    p = multiprocessing.Process(target=pattern.sub, args=("", input_js))
    p.start()
    p.join(30.0)
    try:
        assert not p.is_alive(), "pattern.sub() timed out"
    finally:
        # Make sure a stuck worker never outlives the test.
        if p.is_alive():
            p.terminate()
            p.join()
+
+
def test_fail():
    # (?!) is a never-matching construct, so only the "3" branch can win.
    m = re.search(r"12(?!)|3", "123")
    assert m[0] == "3"
+
+
def test_character_set_any():
    # A union of complementary character classes matches any character,
    # i.e. it is equivalent to "(?s:.)".
    subject = "1x\n"
    for pat in (r"[\s\S]", r"[\d\D]", r"[\w\W]", r"[\S\s]", r"\s|\S"):
        assert re.findall(pat, subject) == list(subject)
        assert re.fullmatch("(?:%s)+" % pat, subject).group() == subject
+
+
+def test_character_set_none():
+ # Negation of the union of complementary character sets does not match
+ # any character.
+ s = "1x\n"
+ for p in r"[^\s\S]", r"[^\d\D]", r"[^\w\W]", r"[^\S\s]":
+ assert re.search(p, s) is None
+ assert re.search("(?s:.)" + p, s) is None